chg: [backend crawler] domains: download an archive of the most recent domain crawl

pull/422/head
Terrtia 2019-06-07 13:47:44 +02:00
parent 0d3d4aae1d
commit b4f06c21f9
2 changed files with 150 additions and 3 deletions

File: HiddenServices.py

@@ -21,6 +21,9 @@ import gzip
 import redis
 import random
+from io import BytesIO
+import zipfile
 import configparser
 import sys
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
@@ -71,6 +74,7 @@ class HiddenServices(object):
             self.paste_crawled_directory = os.path.join(self.paste_directory, cfg.get("Directories", "crawled"))
             self.paste_crawled_directory_name = cfg.get("Directories", "crawled")
             self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
+            self.screenshot_directory_screenshot = os.path.join(self.screenshot_directory, 'screenshot')
         elif type == 'i2p':
             self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
             self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
@@ -230,6 +234,13 @@ class HiddenServices(object):
         return l_crawled_pastes
     '''

+    def get_item_screenshot(self, item):
+        screenshot = self.r_serv_metadata.hget('paste_metadata:{}'.format(item), 'screenshot')
+        if screenshot:
+            screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
+            return screenshot
+        return ''
+
     def get_domain_random_screenshot(self, l_crawled_pastes, num_screenshot = 1):
         l_screenshot_paste = []
         for paste in l_crawled_pastes:
@@ -237,9 +248,8 @@ class HiddenServices(object):
             origin_paste = paste
             paste= paste.replace(self.paste_directory+'/', '')
-            screenshot = self.r_serv_metadata.hget('paste_metadata:{}'.format(paste), 'screenshot')
+            screenshot = self.get_item_screenshot(paste)
             if screenshot:
-                screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
                 l_screenshot_paste.append({'screenshot': screenshot, 'item': origin_paste})

             if len(l_screenshot_paste) > num_screenshot:
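Editorial note, not part of the commit: the new get_item_screenshot helper expands the flat screenshot identifier stored in Redis into a nested directory path by slicing it into two-character segments. A minimal sketch of that sharding scheme (the identifier value below is made up):

import os

# Hypothetical screenshot identifier as stored under paste_metadata:<item>;
# in AIL this is a hash-like name for the captured screenshot.
screenshot = 'a1b2c3d4e5f6deadbeef'

# The first 12 characters become six 2-character directory levels;
# the remainder is kept as the file name.
sharded = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6],
                       screenshot[6:8], screenshot[8:10], screenshot[10:12],
                       screenshot[12:])
print(sharded)  # a1/b2/c3/d4/e5/f6/deadbeef

Sharding like this keeps any single directory from accumulating an unbounded number of screenshot files.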
@@ -250,6 +260,35 @@ class HiddenServices(object):
         else:
             return l_screenshot_paste

+    def get_all_domain_screenshot(self, l_crawled_pastes, filename=False):
+        l_screenshot_paste = []
+        for paste in l_crawled_pastes:
+            ## FIXME: # TODO: remove me
+            origin_paste = paste
+            paste= paste.replace(self.paste_directory+'/', '')
+            screenshot = self.get_item_screenshot(paste)
+            if screenshot:
+                screenshot = screenshot + '.png'
+                screenshot_full_path = os.path.join(self.screenshot_directory_screenshot, screenshot)
+                if filename:
+                    screen_file_name = os.path.basename(paste) + '.png'
+                    l_screenshot_paste.append( (screenshot_full_path, screen_file_name) )
+                else:
+                    l_screenshot_paste.append(screenshot_full_path)
+        return l_screenshot_paste
+
+    def get_all_item_full_path(self, l_items, filename=False):
+        l_full_items = []
+        for item in l_items:
+            item = os.path.join(self.PASTES_FOLDER, item)
+            if filename:
+                file_name = os.path.basename(item) + '.gz'
+                l_full_items.append( (item, file_name) )
+            else:
+                l_full_items.append(item)
+        return l_full_items
+
     def get_crawled_pastes_by_date(self, date):
         pastes_path = os.path.join(self.paste_crawled_directory, date[0:4], date[4:6], date[6:8])
@@ -258,6 +297,63 @@ class HiddenServices(object):
             l_crawled_pastes = []
         return l_crawled_pastes

+    def get_all_har(self, l_pastes, filename=False):
+        all_har = []
+        for item in l_pastes:
+            if filename:
+                all_har.append( (self.get_item_har(item), os.path.basename(item) + '.json') )
+            else:
+                all_har.append(self.get_item_har(item))
+        return all_har
+
+    def get_item_har(self, item_path):
+        item_path = item_path.replace('{}/'.format(self.paste_crawled_directory_name), '', 1)
+        har_path = os.path.join(self.screenshot_directory, item_path) + '.json'
+        return har_path
+
+    def create_domain_basic_archive(self, l_pastes):
+        all_har = self.get_all_har(l_pastes, filename=True)
+        all_screenshot = self.get_all_domain_screenshot(l_pastes, filename=True)
+        all_items = self.get_all_item_full_path(l_pastes, filename=True)
+
+        # try:
+        # zip buffer
+        zip_buffer = BytesIO()
+        with zipfile.ZipFile(zip_buffer, "a") as zf:
+            #print(all_har)
+            self.write_in_zip_buffer(zf, all_har)
+            self.write_in_zip_buffer(zf, all_screenshot)
+            self.write_in_zip_buffer(zf, all_items)
+            # write map url
+            map_file_content = self.get_metadata_file(l_pastes).encode()
+            zf.writestr( '_URL_MAP_', BytesIO(map_file_content).getvalue())
+        zip_buffer.seek(0)
+        return zip_buffer
+        # except Exception as e:
+        #     print(e)
+        #     return 'Server Error'
+
+    def write_in_zip_buffer(self, zf, list_file):
+        for file_path, file_name in list_file:
+            with open(file_path, "rb") as f:
+                har_content = f.read()
+                zf.writestr( file_name, BytesIO(har_content).getvalue())
+
+    def get_metadata_file(self, list_items):
+        file_content = ''
+        dict_url = self.get_all_links(list_items)
+        for key in dict_url:
+            file_content = '{}\n{} : {}'.format(file_content, os.path.basename(key), dict_url[key])
+        return file_content
+
     '''
     def get_last_crawled_pastes_fileSearch(self):
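Editorial note, not part of the commit: create_domain_basic_archive builds the archive entirely in memory, writing each file into a zipfile.ZipFile backed by a BytesIO buffer and rewinding the buffer before returning it, so nothing touches disk and the buffer can be handed straight to Flask's send_file. A self-contained sketch of the same pattern (file names and contents are illustrative):

import zipfile
from io import BytesIO

def build_zip(files):
    """Build an in-memory zip from (archive_name, bytes) pairs."""
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, 'a') as zf:
        for name, content in files:
            zf.writestr(name, content)
    zip_buffer.seek(0)  # rewind so the consumer reads from the start
    return zip_buffer

# Illustrative contents mirroring the archive layout: items, screenshots,
# HAR files, plus the _URL_MAP_ index written by get_metadata_file().
buf = build_zip([('item1.gz', b'...'),
                 ('_URL_MAP_', b'item1 : http://example.onion')])
print(len(buf.getvalue()), 'bytes')

A small observation on the committed code: zf.writestr(file_name, BytesIO(har_content).getvalue()) round-trips the bytes through a second BytesIO, which is a no-op; writestr accepts the raw bytes directly.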

File: Flask_hiddenServices.py (hiddenServices blueprint)

@@ -11,7 +11,7 @@ import os
 import time
 import json
 from pyfaup.faup import Faup
-from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for
+from flask import Flask, render_template, jsonify, request, send_file, Blueprint, redirect, url_for

 from Date import Date
 from HiddenServices import HiddenServices
@@ -35,6 +35,7 @@ list_types=['onion', 'regular']
 dic_type_name={'onion':'Onion', 'regular':'Website'}
+
 # ============ FUNCTIONS ============

 def one():
     return 1
@@ -782,6 +783,56 @@ def show_domain():
                            origin_paste=origin_paste, origin_paste_name=origin_paste_name,
                            domain_tags=domain_tags, screenshot=screenshot)

+@hiddenServices.route("/crawlers/download_domain", methods=['GET'])
+def download_domain():
+    domain = request.args.get('domain')
+    epoch = request.args.get('epoch')
+    try:
+        epoch = int(epoch)
+    except:
+        epoch = None
+    port = request.args.get('port')
+    faup.decode(domain)
+    unpack_url = faup.get()
+
+    ## TODO: # FIXME: remove me
+    try:
+        domain = unpack_url['domain'].decode()
+    except:
+        domain = unpack_url['domain']
+
+    if not port:
+        if unpack_url['port']:
+            try:
+                port = unpack_url['port'].decode()
+            except:
+                port = unpack_url['port']
+        else:
+            port = 80
+    try:
+        port = int(port)
+    except:
+        port = 80
+
+    type = get_type_domain(domain)
+    if domain is None or not r_serv_onion.exists('{}_metadata:{}'.format(type, domain)):
+        return '404'
+    # # TODO: FIXME return 404
+
+    origin_paste = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'paste_parent')
+
+    h = HiddenServices(domain, type, port=port)
+    item_core = h.get_domain_crawled_core_item(epoch=epoch)
+    if item_core:
+        l_pastes = h.get_last_crawled_pastes(item_root=item_core['root_item'])
+    else:
+        l_pastes = []
+    #dict_links = h.get_all_links(l_pastes)
+
+    zip_file = h.create_domain_basic_archive(l_pastes)
+
+    return send_file(zip_file, attachment_filename='test.zip', as_attachment=True)
+
 @hiddenServices.route("/hiddenServices/onion_son", methods=['GET'])
 def onion_son():
     onion_domain = request.args.get('onion_domain')
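Editorial note, not part of the commit: once the blueprint is registered, the new endpoint can be exercised with a plain HTTP GET; the route streams the in-memory zip back as an attachment (served under the hard-coded name 'test.zip'). A hedged example using requests, where the host, port, domain, and epoch are placeholders and the final URL depends on the prefix the hiddenServices blueprint is registered under:

import requests

# Placeholder base URL: adjust to your AIL instance and blueprint prefix.
url = 'http://127.0.0.1:7000/crawlers/download_domain'
# 'epoch' selects a specific crawl; omit it to get the most recent one.
params = {'domain': 'example.onion', 'port': 80}

r = requests.get(url, params=params)
r.raise_for_status()
with open('domain_archive.zip', 'wb') as f:
    f.write(r.content)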