mirror of https://github.com/CIRCL/AIL-framework
chg: [update v1.5] add background update: screenshots_crawled
parent
9868833c77
commit
e6dca7f8bf
|
@ -140,7 +140,7 @@ class TorSplashCrawler():
|
||||||
UUID = self.domains[0]+str(uuid.uuid4())
|
UUID = self.domains[0]+str(uuid.uuid4())
|
||||||
filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID)
|
filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID)
|
||||||
relative_filename_paste = os.path.join(self.crawler_path, UUID)
|
relative_filename_paste = os.path.join(self.crawler_path, UUID)
|
||||||
filename_har = os.path.join(self.crawled_har, UUID +'.png')
|
filename_har = os.path.join(self.crawled_har, UUID)
|
||||||
|
|
||||||
# # TODO: modify me
|
# # TODO: modify me
|
||||||
# save new paste on disk
|
# save new paste on disk
|
||||||
|
@ -180,17 +180,12 @@ class TorSplashCrawler():
|
||||||
|
|
||||||
self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], relative_filename_paste)
|
self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], relative_filename_paste)
|
||||||
|
|
||||||
dirname = os.path.dirname(filename_har)
|
|
||||||
if not os.path.exists(dirname):
|
|
||||||
os.makedirs(dirname)
|
|
||||||
|
|
||||||
if 'png' in response.data:
|
if 'png' in response.data:
|
||||||
size_screenshot = (len(response.data['png'])*3) /4
|
size_screenshot = (len(response.data['png'])*3) /4
|
||||||
|
|
||||||
if size_screenshot < 5000000: #bytes
|
if size_screenshot < 5000000: #bytes
|
||||||
image_content = base64.standard_b64decode(response.data['png'].encode())
|
image_content = base64.standard_b64decode(response.data['png'].encode())
|
||||||
hash = sha256(image_content).hexdigest()
|
hash = sha256(image_content).hexdigest()
|
||||||
print(hash)
|
|
||||||
img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12])
|
img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12])
|
||||||
filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] +'.png')
|
filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] +'.png')
|
||||||
dirname = os.path.dirname(filename_img)
|
dirname = os.path.dirname(filename_img)
|
||||||
|
@ -202,13 +197,13 @@ class TorSplashCrawler():
|
||||||
# add item metadata
|
# add item metadata
|
||||||
self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash)
|
self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash)
|
||||||
# add sha256 metadata
|
# add sha256 metadata
|
||||||
self.r_serv_onion.zincrby('screenshot:{}'.format(hash), relative_filename_paste, 1)
|
self.r_serv_onion.sadd('screenshot:{}'.format(hash), relative_filename_paste)
|
||||||
|
|
||||||
if 'har' in response.data:
|
if 'har' in response.data:
|
||||||
dirname = os.path.dirname(filename_har)
|
dirname = os.path.dirname(filename_har)
|
||||||
if not os.path.exists(dirname):
|
if not os.path.exists(dirname):
|
||||||
os.makedirs(dirname)
|
os.makedirs(dirname)
|
||||||
with open(filename_har+'har.txt', 'wb') as f:
|
with open(filename_har+'.json', 'wb') as f:
|
||||||
f.write(json.dumps(response.data['har']).encode())
|
f.write(json.dumps(response.data['har']).encode())
|
||||||
|
|
||||||
# save external links in set
|
# save external links in set
|
||||||
|
|
|
@ -31,7 +31,7 @@ if __name__ == "__main__":
|
||||||
db=cfg.getint("ARDB_DB", "db"),
|
db=cfg.getint("ARDB_DB", "db"),
|
||||||
decode_responses=True)
|
decode_responses=True)
|
||||||
|
|
||||||
if r_serv.scard('ail:update_v1.5') != 4:
|
if r_serv.scard('ail:update_v1.5') != 5:
|
||||||
r_serv.delete('ail:update_error')
|
r_serv.delete('ail:update_error')
|
||||||
r_serv.set('ail:update_in_progress', 'v1.5')
|
r_serv.set('ail:update_in_progress', 'v1.5')
|
||||||
r_serv.set('ail:current_background_update', 'v1.5')
|
r_serv.set('ail:current_background_update', 'v1.5')
|
||||||
|
@ -50,7 +50,10 @@ if __name__ == "__main__":
|
||||||
if not r_serv.sismember('ail:update_v1.5', 'tags_background'):
|
if not r_serv.sismember('ail:update_v1.5', 'tags_background'):
|
||||||
update_file = os.path.join(os.environ['AIL_HOME'], 'update', 'v1.4', 'Update-ARDB_Tags_background.py')
|
update_file = os.path.join(os.environ['AIL_HOME'], 'update', 'v1.4', 'Update-ARDB_Tags_background.py')
|
||||||
process = subprocess.run(['python' ,update_file])
|
process = subprocess.run(['python' ,update_file])
|
||||||
if r_serv.scard('ail:update_v1.5') != 4:
|
if not r_serv.sismember('ail:update_v1.5', 'crawled_screenshot'):
|
||||||
|
update_file = os.path.join(os.environ['AIL_HOME'], 'update', 'v1.4', 'Update-ARDB_Onions_screenshots.py')
|
||||||
|
process = subprocess.run(['python' ,update_file])
|
||||||
|
if r_serv.scard('ail:update_v1.5') != 5:
|
||||||
r_serv.set('ail:update_error', 'Update v1.5 Failed, please relaunch the bin/update-background.py script')
|
r_serv.set('ail:update_error', 'Update v1.5 Failed, please relaunch the bin/update-background.py script')
|
||||||
else:
|
else:
|
||||||
r_serv.delete('ail:update_in_progress')
|
r_serv.delete('ail:update_in_progress')
|
||||||
|
|
|
@ -0,0 +1,133 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*-coding:UTF-8 -*
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import redis
|
||||||
|
import datetime
|
||||||
|
import configparser
|
||||||
|
|
||||||
|
from hashlib import sha256
|
||||||
|
|
||||||
|
def rreplace(s, old, new, occurrence):
    """Replace the last `occurrence` occurrences of `old` in `s` with `new`.

    The string is split on `old` starting from the right, at most
    `occurrence` times, and the resulting pieces are glued back together
    with `new`.
    """
    return new.join(s.rsplit(old, occurrence))
|
||||||
|
|
||||||
|
def substract_date(date_from, date_to):
    """Return every day from `date_from` to `date_to`, both included.

    Both arguments are strings whose first eight characters are read as
    year (4 digits), month (2 digits) and day (2 digits). The result is a
    list of 'YYYYMMDD' strings in chronological order; it is empty when
    `date_to` is earlier than `date_from`.
    """
    def _to_date(text):
        # Fixed-width fields: YYYY MM DD.
        return datetime.date(int(text[0:4]), int(text[4:6]), int(text[6:8]))

    first_day = _to_date(date_from)
    span = _to_date(date_to) - first_day  # timedelta
    return [
        (first_day + datetime.timedelta(offset)).strftime('%Y%m%d')
        for offset in range(span.days + 1)
    ]
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Background update step 'crawled_screenshot' of the v1.5 update.
    #
    # Walks the dated screenshot directories (<crawled_screenshot>/YYYY/MM/DD),
    # moves every .png into a sha256-addressed tree under
    # <crawled_screenshot>/screenshot/, records the hash in the metadata and
    # onion databases, and renames '<name>.pnghar.txt' HAR dumps to
    # '<name>.json'. Progress is published in 'ail:current_background_script_stat'.

    start_deb = time.time()

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        # Adjacent literals instead of backslash continuations, so that the
        # source indentation does not end up inside the message.
        raise Exception('Unable to find the configuration file. '
                        'Did you set environment variables? '
                        'Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
    NEW_SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"), 'screenshot')

    # NOTE(review): PASTES_FOLDER and r_serv_tag are not used below.
    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'

    r_serv = redis.StrictRedis(
        host=cfg.get("ARDB_DB", "host"),
        port=cfg.getint("ARDB_DB", "port"),
        db=cfg.getint("ARDB_DB", "db"),
        decode_responses=True)

    r_serv_metadata = redis.StrictRedis(
        host=cfg.get("ARDB_Metadata", "host"),
        port=cfg.getint("ARDB_Metadata", "port"),
        db=cfg.getint("ARDB_Metadata", "db"),
        decode_responses=True)

    r_serv_tag = redis.StrictRedis(
        host=cfg.get("ARDB_Tags", "host"),
        port=cfg.getint("ARDB_Tags", "port"),
        db=cfg.getint("ARDB_Tags", "db"),
        decode_responses=True)

    r_serv_onion = redis.StrictRedis(
        host=cfg.get("ARDB_Onion", "host"),
        port=cfg.getint("ARDB_Onion", "port"),
        db=cfg.getint("ARDB_Onion", "db"),
        decode_responses=True)

    # Announce which background script is running and reset its progress.
    r_serv.set('ail:current_background_script', 'crawled_screenshot')
    r_serv.set('ail:current_background_script_stat', 0)

    ## Update Onion ##
    print('Updating ARDB_Onion ...')
    index = 0  # number of .png screenshots processed
    start = time.time()

    # One dated directory per day, from the hard-coded first day up to today.
    date_from = '20180801'
    date_today = datetime.date.today().strftime("%Y%m%d")
    list_date = substract_date(date_from, date_today)

    nb_done = 0  # number of days processed
    last_progress = 0
    total_to_update = len(list_date)

    for date in list_date:
        screenshot_dir = os.path.join(SCREENSHOT_FOLDER, date[0:4], date[4:6], date[6:8])
        if os.path.isdir(screenshot_dir):
            print(screenshot_dir)
            for filename in os.listdir(screenshot_dir):
                if filename.endswith(".png"):
                    index += 1

                    img_path = os.path.join(screenshot_dir, filename)
                    with open(img_path, 'rb') as f:
                        image_content = f.read()

                    # Content-addressed location: six two-character directory
                    # levels taken from the hash, the rest is the file name.
                    img_hash = sha256(image_content).hexdigest()
                    img_dir_path = os.path.join(img_hash[0:2], img_hash[2:4], img_hash[4:6],
                                                img_hash[6:8], img_hash[8:10], img_hash[10:12])
                    filename_img = os.path.join(NEW_SCREENSHOT_FOLDER, img_dir_path, img_hash[12:] + '.png')

                    # exist_ok avoids the exists()/makedirs() race.
                    os.makedirs(os.path.dirname(filename_img), exist_ok=True)

                    # An identical screenshot may already be stored; in that
                    # case the dated copy is left where it is.
                    if not os.path.exists(filename_img):
                        os.rename(img_path, filename_img)

                    # Item id: dated crawl path plus the file name without '.png'.
                    item = os.path.join('crawled', date[0:4], date[4:6], date[6:8], filename[:-4])
                    # add item metadata
                    r_serv_metadata.hset('paste_metadata:{}'.format(item), 'screenshot', img_hash)
                    # add sha256 metadata
                    r_serv_onion.sadd('screenshot:{}'.format(img_hash), item)

                if filename.endswith('.pnghar.txt'):
                    # HAR dumps keep their directory; only the suffix changes.
                    har_path = os.path.join(screenshot_dir, filename)
                    new_file = rreplace(filename, '.pnghar.txt', '.json', 1)
                    new_har_path = os.path.join(screenshot_dir, new_file)
                    os.rename(har_path, new_har_path)

        # Count the day before computing the percentage so the published
        # progress reflects the days actually completed.
        nb_done += 1
        progress = int((nb_done * 100) / total_to_update)
        # update progress stats
        if progress != last_progress:
            r_serv.set('ail:current_background_script_stat', progress)
            print('{}/{} screenshot updated {}%'.format(nb_done, total_to_update, progress))
            last_progress = progress

    r_serv.set('ail:current_background_script_stat', 100)

    end = time.time()
    print('Updating ARDB_Onion Done => {} paths: {} s'.format(index, end - start))
    print()
    print('Done in {} s'.format(end - start_deb))

    # Mark this background step as completed for the v1.5 update.
    r_serv.sadd('ail:update_v1.5', 'crawled_screenshot')
|
|
@ -160,7 +160,7 @@ DiffMaxLineLength = int(cfg.get("Flask", "DiffMaxLineLength"))#Use to display t
|
||||||
|
|
||||||
bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info']
|
bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info']
|
||||||
|
|
||||||
dict_update_description = {'v1.5':{'nb_background_update': 4, 'update_warning_message': 'An Update is running on the background. Some informations like Tags, screenshot can be',
|
dict_update_description = {'v1.5':{'nb_background_update': 5, 'update_warning_message': 'An Update is running on the background. Some informations like Tags, screenshot can be',
|
||||||
'update_warning_message_notice_me': 'missing from the UI.'}
|
'update_warning_message_notice_me': 'missing from the UI.'}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -658,7 +658,10 @@ def show_domain():
|
||||||
unpack_url = faup.get()
|
unpack_url = faup.get()
|
||||||
domain = unpack_url['domain'].decode()
|
domain = unpack_url['domain'].decode()
|
||||||
if not port:
|
if not port:
|
||||||
|
if unpack_url['port']:
|
||||||
port = unpack_url['port'].decode()
|
port = unpack_url['port'].decode()
|
||||||
|
else:
|
||||||
|
port = 80
|
||||||
try:
|
try:
|
||||||
port = int(port)
|
port = int(port)
|
||||||
except:
|
except:
|
||||||
|
|
Loading…
Reference in New Issue