mirror of https://github.com/CIRCL/AIL-framework
chg: [backend crawler] domains: download 1 archive by crawled (most recent)
parent 0d3d4aae1d
commit b4f06c21f9
@@ -21,6 +21,9 @@ import gzip
 import redis
 import random

+from io import BytesIO
+import zipfile
+
 import configparser
 import sys
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
@@ -71,6 +74,7 @@ class HiddenServices(object):
             self.paste_crawled_directory = os.path.join(self.paste_directory, cfg.get("Directories", "crawled"))
             self.paste_crawled_directory_name = cfg.get("Directories", "crawled")
             self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
+            self.screenshot_directory_screenshot = os.path.join(self.screenshot_directory, 'screenshot')
         elif type == 'i2p':
             self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
             self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
@@ -230,6 +234,13 @@ class HiddenServices(object):
         return l_crawled_pastes
     '''

+    def get_item_screenshot(self, item):
+        screenshot = self.r_serv_metadata.hget('paste_metadata:{}'.format(item), 'screenshot')
+        if screenshot:
+            screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
+            return screenshot
+        return ''
+
     def get_domain_random_screenshot(self, l_crawled_pastes, num_screenshot = 1):
         l_screenshot_paste = []
         for paste in l_crawled_pastes:
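The new get_item_screenshot() helper factors out the path sharding that was previously inlined in get_domain_random_screenshot(): the screenshot identifier stored under paste_metadata is split into two-character directory levels. A minimal standalone sketch of that slicing, outside the class (the identifier below is made up for illustration):

import os

def shard_screenshot_path(screenshot_id):
    # Split the stored identifier into 2-character directory levels,
    # mirroring the slicing done by get_item_screenshot().
    return os.path.join(screenshot_id[0:2], screenshot_id[2:4], screenshot_id[4:6],
                        screenshot_id[6:8], screenshot_id[8:10], screenshot_id[10:12],
                        screenshot_id[12:])

# Example with a made-up identifier:
print(shard_screenshot_path('a1b2c3d4e5f6rest_of_hash'))
# -> a1/b2/c3/d4/e5/f6/rest_of_hash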
@@ -237,9 +248,8 @@ class HiddenServices(object):
             origin_paste = paste
             paste= paste.replace(self.paste_directory+'/', '')

-            screenshot = self.r_serv_metadata.hget('paste_metadata:{}'.format(paste), 'screenshot')
+            screenshot = self.get_item_screenshot(paste)
             if screenshot:
-                screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
                 l_screenshot_paste.append({'screenshot': screenshot, 'item': origin_paste})

         if len(l_screenshot_paste) > num_screenshot:
@@ -250,6 +260,35 @@ class HiddenServices(object):
         else:
             return l_screenshot_paste

+    def get_all_domain_screenshot(self, l_crawled_pastes, filename=False):
+        l_screenshot_paste = []
+        for paste in l_crawled_pastes:
+            ## FIXME: # TODO: remove me
+            origin_paste = paste
+            paste= paste.replace(self.paste_directory+'/', '')
+
+            screenshot = self.get_item_screenshot(paste)
+            if screenshot:
+                screenshot = screenshot + '.png'
+                screenshot_full_path = os.path.join(self.screenshot_directory_screenshot, screenshot)
+                if filename:
+                    screen_file_name = os.path.basename(paste) + '.png'
+                    l_screenshot_paste.append( (screenshot_full_path, screen_file_name) )
+                else:
+                    l_screenshot_paste.append(screenshot_full_path)
+        return l_screenshot_paste
+
+    def get_all_item_full_path(self, l_items, filename=False):
+        l_full_items = []
+        for item in l_items:
+            item = os.path.join(self.PASTES_FOLDER, item)
+            if filename:
+                file_name = os.path.basename(item) + '.gz'
+                l_full_items.append( (item, file_name) )
+            else:
+                l_full_items.append(item)
+        return l_full_items
+
     def get_crawled_pastes_by_date(self, date):

         pastes_path = os.path.join(self.paste_crawled_directory, date[0:4], date[4:6], date[6:8])
@@ -258,6 +297,63 @@ class HiddenServices(object):
             l_crawled_pastes = []
         return l_crawled_pastes

+    def get_all_har(self, l_pastes, filename=False):
+        all_har = []
+        for item in l_pastes:
+            if filename:
+                all_har.append( (self.get_item_har(item), os.path.basename(item) + '.json') )
+            else:
+                all_har.append(self.get_item_har(item))
+        return all_har
+
+
+    def get_item_har(self, item_path):
+        item_path = item_path.replace('{}/'.format(self.paste_crawled_directory_name), '', 1)
+        har_path = os.path.join(self.screenshot_directory, item_path) + '.json'
+        return har_path
+
+    def create_domain_basic_archive(self, l_pastes):
+        all_har = self.get_all_har(l_pastes, filename=True)
+        all_screenshot = self.get_all_domain_screenshot(l_pastes, filename=True)
+        all_items = self.get_all_item_full_path(l_pastes, filename=True)
+
+        # try:
+
+        # zip buffer
+        zip_buffer = BytesIO()
+
+        with zipfile.ZipFile(zip_buffer, "a") as zf:
+
+            #print(all_har)
+            self.write_in_zip_buffer(zf, all_har)
+            self.write_in_zip_buffer(zf, all_screenshot)
+            self.write_in_zip_buffer(zf, all_items)
+
+            # write map url
+            map_file_content = self.get_metadata_file(l_pastes).encode()
+            zf.writestr( '_URL_MAP_', BytesIO(map_file_content).getvalue())
+
+        zip_buffer.seek(0)
+        return zip_buffer
+
+        # except Exception as e:
+        #     print(e)
+        #     return 'Server Error'
+
+    def write_in_zip_buffer(self, zf, list_file):
+        for file_path, file_name in list_file:
+            with open(file_path, "rb") as f:
+                har_content = f.read()
+                zf.writestr( file_name, BytesIO(har_content).getvalue())
+
+    def get_metadata_file(self, list_items):
+        file_content = ''
+        dict_url = self.get_all_links(list_items)
+        for key in dict_url:
+            file_content = '{}\n{} : {}'.format(file_content, os.path.basename(key), dict_url[key])
+        return file_content
+
+
     '''
     def get_last_crawled_pastes_fileSearch(self):

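The new create_domain_basic_archive() builds the download entirely in memory: a BytesIO buffer is wrapped in a zipfile.ZipFile, the HAR files, screenshots and crawled items are written into it, a '_URL_MAP_' entry maps file names back to their URLs, and the buffer is rewound before being returned. A minimal sketch of that in-memory zip pattern, independent of the AIL classes (file names and contents here are placeholders):

from io import BytesIO
import zipfile

def build_zip(named_blobs):
    # named_blobs: iterable of (file_name, bytes) pairs
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, "a") as zf:
        for file_name, content in named_blobs:
            zf.writestr(file_name, content)
    zip_buffer.seek(0)  # rewind so the caller can stream it (e.g. via Flask send_file)
    return zip_buffer

# Example with placeholder content:
buf = build_zip([('_URL_MAP_', b'item1 : http://example.onion/'),
                 ('item1.gz', b'...')])

The remaining hunks touch the Flask blueprint that exposes the crawler views.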
@@ -11,7 +11,7 @@ import os
 import time
 import json
 from pyfaup.faup import Faup
-from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for
+from flask import Flask, render_template, jsonify, request, send_file, Blueprint, redirect, url_for

 from Date import Date
 from HiddenServices import HiddenServices
@@ -35,6 +35,7 @@ list_types=['onion', 'regular']
 dic_type_name={'onion':'Onion', 'regular':'Website'}

 # ============ FUNCTIONS ============
+
 def one():
     return 1

|
||||||
origin_paste=origin_paste, origin_paste_name=origin_paste_name,
|
origin_paste=origin_paste, origin_paste_name=origin_paste_name,
|
||||||
domain_tags=domain_tags, screenshot=screenshot)
|
domain_tags=domain_tags, screenshot=screenshot)
|
||||||
|
|
||||||
|
@hiddenServices.route("/crawlers/download_domain", methods=['GET'])
|
||||||
|
def download_domain():
|
||||||
|
domain = request.args.get('domain')
|
||||||
|
epoch = request.args.get('epoch')
|
||||||
|
try:
|
||||||
|
epoch = int(epoch)
|
||||||
|
except:
|
||||||
|
epoch = None
|
||||||
|
port = request.args.get('port')
|
||||||
|
faup.decode(domain)
|
||||||
|
unpack_url = faup.get()
|
||||||
|
|
||||||
|
## TODO: # FIXME: remove me
|
||||||
|
try:
|
||||||
|
domain = unpack_url['domain'].decode()
|
||||||
|
except:
|
||||||
|
domain = unpack_url['domain']
|
||||||
|
|
||||||
|
if not port:
|
||||||
|
if unpack_url['port']:
|
||||||
|
try:
|
||||||
|
port = unpack_url['port'].decode()
|
||||||
|
except:
|
||||||
|
port = unpack_url['port']
|
||||||
|
else:
|
||||||
|
port = 80
|
||||||
|
try:
|
||||||
|
port = int(port)
|
||||||
|
except:
|
||||||
|
port = 80
|
||||||
|
type = get_type_domain(domain)
|
||||||
|
if domain is None or not r_serv_onion.exists('{}_metadata:{}'.format(type, domain)):
|
||||||
|
return '404'
|
||||||
|
# # TODO: FIXME return 404
|
||||||
|
|
||||||
|
origin_paste = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'paste_parent')
|
||||||
|
|
||||||
|
h = HiddenServices(domain, type, port=port)
|
||||||
|
item_core = h.get_domain_crawled_core_item(epoch=epoch)
|
||||||
|
if item_core:
|
||||||
|
l_pastes = h.get_last_crawled_pastes(item_root=item_core['root_item'])
|
||||||
|
else:
|
||||||
|
l_pastes = []
|
||||||
|
#dict_links = h.get_all_links(l_pastes)
|
||||||
|
|
||||||
|
zip_file = h.create_domain_basic_archive(l_pastes)
|
||||||
|
|
||||||
|
return send_file(zip_file, attachment_filename='test.zip', as_attachment=True)
|
||||||
|
|
||||||
|
|
||||||
@hiddenServices.route("/hiddenServices/onion_son", methods=['GET'])
|
@hiddenServices.route("/hiddenServices/onion_son", methods=['GET'])
|
||||||
def onion_son():
|
def onion_son():
|
||||||
onion_domain = request.args.get('onion_domain')
|
onion_domain = request.args.get('onion_domain')
|
||||||
|
|
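With send_file added to the Flask imports, the new /crawlers/download_domain route streams the zip buffer back to the client. A hedged usage sketch (the base URL is an assumption about a local AIL deployment; domain, port and epoch are the query parameters the route reads):

import requests

# Assumed local AIL web interface; adjust scheme/host/port to your deployment.
base_url = 'https://127.0.0.1:7000'
params = {'domain': 'example.onion', 'port': 80}  # 'epoch' can select an older crawl

r = requests.get('{}/crawlers/download_domain'.format(base_url), params=params, verify=False)
with open('domain_archive.zip', 'wb') as f:
    f.write(r.content)  # zip containing the HARs, screenshots, items and _URL_MAP_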