#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
The Global Module
=================

This module consumes the Redis list created by the ZMQ_Feed_Q module
and saves each paste on disk so that other modules can work on it.

..todo:: Allow choosing whether or not to delete the saved paste after processing.
..todo:: Store the empty (unprocessed) pastes somewhere in Redis.

..note:: The ZMQ_Something_Q and ZMQ_Something modules are closely bound: always put
the same Subscriber name in both of them.

Requirements
------------

*Need running Redis instances.
*Need the ZMQ_Feed_Q Module running to be able to work properly.

"""
import base64
import hashlib
import io
import gzip
import os
import sys
import time
import uuid

import datetime
import redis

sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader

from pubsublogger import publisher

from Helper import Process

config_loader = ConfigLoader.ConfigLoader()
r_stats = config_loader.get_redis_conn("ARDB_Statistics")
config_loader = None

def gunzip_bytes_obj(bytes_obj):
    in_ = io.BytesIO()
    in_.write(bytes_obj)
    in_.seek(0)

    with gzip.GzipFile(fileobj=in_, mode='rb') as fo:
        gunzipped_bytes_obj = fo.read()

    return gunzipped_bytes_obj
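
# Note: this helper is equivalent to gzip.decompress() on Python 3, e.g.
# gunzip_bytes_obj(gzip.compress(b'abc')) == b'abc'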

def rreplace(s, old, new, occurrence):
    li = s.rsplit(old, occurrence)
    return new.join(li)
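
# rreplace() substitutes the last `occurrence` matches of `old`, counting from
# the right, e.g.: rreplace('a.gz/b/a.gz', 'a.gz', 'c.gz', 1) -> 'a.gz/b/c.gz'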

if __name__ == '__main__':
    publisher.port = 6380
    publisher.channel = 'Script'
    processed_paste = 0
    time_1 = time.time()

    config_section = 'Global'

    p = Process(config_section)

    # get and sanitize the PASTE DIRECTORY
    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
    PASTES_FOLDERS = PASTES_FOLDER + '/'
    PASTES_FOLDERS = os.path.join(os.path.realpath(PASTES_FOLDERS), '')
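    # os.path.join(realpath, '') keeps a trailing separator, so the later
    # substring checks ('PASTES_FOLDERS in paste') cannot match a sibling
    # directory with the same prefix (e.g. a hypothetical '<pastes>_backup')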

    # LOGGING #
    publisher.info("Feed Script started to receive & publish.")

    while True:

        message = p.get_from_set()
        # Recover the streamed message information.
        if message is not None:
            splitted = message.split()
            if len(splitted) == 2:
                paste, gzip64encoded = splitted
            else:
                # TODO: store the name of the empty paste inside a Redis list.
                print("Empty Paste: not processed")
                publisher.debug("Empty Paste: {0} not processed".format(message))
                continue
        else:
            #print("Empty Queues: Waiting...")
            if int(time.time() - time_1) > 30:
                to_print = 'Global; ; ; ;glob Processed {0} paste(s) in {1} s'.format(processed_paste, time.time() - time_1)
                print(to_print)
                #publisher.info(to_print)
                time_1 = time.time()
                processed_paste = 0
            time.sleep(0.5)
            continue

        # remove PASTES_FOLDER from the item path (crawled + submitted items)
        if PASTES_FOLDERS in paste:
            paste = paste.replace(PASTES_FOLDERS, '', 1)

        # truncate file names longer than 255 characters (a common filesystem limit)
        file_name_paste = paste.split('/')[-1]
        if len(file_name_paste) > 255:
            new_file_name_paste = '{}{}.gz'.format(file_name_paste[:215], str(uuid.uuid4()))
            paste = rreplace(paste, file_name_paste, new_file_name_paste, 1)

        # Create the full filepath
        filename = os.path.join(PASTES_FOLDER, paste)
        filename = os.path.realpath(filename)
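        # a sketch of what the next check guards against: an item path like
        # '../../../etc/passwd' (illustrative) would realpath() to a location
        # outside PASTES_FOLDER (note that os.path.commonprefix compares
        # strings character-by-character, not path components)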

        # reject filenames that resolve outside of PASTES_FOLDER
        if not os.path.commonprefix([filename, PASTES_FOLDER]) == PASTES_FOLDER:
            print('Path traversal detected {}'.format(filename))
            publisher.warning('Global; Path traversal detected')
        else:

            # decode compressed base64
            decoded = base64.standard_b64decode(gzip64encoded)
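
            # note: `decoded` is still gzip-compressed at this point; it is
            # written to disk unchanged below, and only decompressed in memory
            # (via gunzip_bytes_obj) when an md5 comparison is needed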

            # check if the file already exists
            if os.path.isfile(filename):
                print('File already exists {}'.format(filename))
                publisher.warning('Global; File already exists')

                try:
                    with gzip.open(filename, 'rb') as f:
                        curr_file_content = f.read()
                except EOFError:
                    publisher.warning('Global; Incomplete file: {}'.format(filename))
                    # save daily stats (assuming redis-py >= 3.0, whose zincrby
                    # signature is (name, amount, value))
                    r_stats.zincrby('module:Global:incomplete_file', 1, datetime.datetime.now().strftime('%Y%m%d'))
                    # discard item
                    continue
                except OSError:
                    publisher.warning('Global; Not a gzipped file: {}'.format(filename))
                    # save daily stats (same zincrby argument-order assumption)
                    r_stats.zincrby('module:Global:invalid_file', 1, datetime.datetime.now().strftime('%Y%m%d'))
                    # discard item
                    continue

                curr_file_md5 = hashlib.md5(curr_file_content).hexdigest()

                new_file_content = gunzip_bytes_obj(decoded)
                new_file_md5 = hashlib.md5(new_file_content).hexdigest()

                # same name, different content: a name collision to resolve;
                # identical content: a true duplicate to ignore
                if new_file_md5 != curr_file_md5:

                    # append the new content's md5 to build a distinct filename
                    if filename.endswith('.gz'):
                        filename = '{}_{}.gz'.format(filename[:-3], new_file_md5)
                    else:
                        filename = '{}_{}'.format(filename, new_file_md5)

                    # discard the item if the renamed file already exists
                    if os.path.isfile(filename):
                        print('ignore duplicated file')
                        continue

                    print('new file: {}'.format(filename))
                # ignore duplicate
                else:
                    print('ignore duplicated file')
                    continue

            # create subdir
            dirname = os.path.dirname(filename)
            if not os.path.exists(dirname):
                os.makedirs(dirname)

            with open(filename, 'wb') as f:
                f.write(decoded)

            paste = filename
            # remove PASTES_FOLDER from the item path
            if PASTES_FOLDERS in paste:
                paste = paste.replace(PASTES_FOLDERS, '', 1)

            p.populate_set_out(paste)
            processed_paste += 1
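
# Usage sketch (assumptions: AIL_HOME and AIL_BIN are exported, as the imports
# above require; the module is then launched like any other AIL script):
#
#   export AIL_HOME=/path/to/ail
#   export AIL_BIN="$AIL_HOME/bin"
#   python3 <this_module>.py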