2018-05-04 13:53:29 +02:00
|
|
|
#!/usr/bin/env python3
|
2014-08-06 11:43:40 +02:00
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
"""
|
|
|
|
The ZMQ_Feed_Q Module
|
|
|
|
=====================
|
|
|
|
|
|
|
|
This module consumes the Redis list created by the ZMQ_Feed_Q module
and saves the items on disk to allow other modules to work on them.
|
2014-08-06 11:43:40 +02:00
|
|
|
|
2021-05-27 17:28:20 +02:00
|
|
|
.. todo:: Make it possible to choose whether the saved item is deleted after processing.
|
2014-08-06 11:43:40 +02:00
|
|
|
|
|
|
|
.. note:: Modules ZMQ_Something_Q and ZMQ_Something are closely bound; always use
the same Subscriber name in both of them.
|
|
|
|
|
|
|
|
Requirements
|
|
|
|
------------
|
|
|
|
|
|
|
|
* Needs running Redis instances.
* Needs the Mixer or the Importer module running to be able to work properly.
|
2014-08-06 11:43:40 +02:00
|
|
|
|
|
|
|
"""
|
2021-04-28 15:24:33 +02:00
|
|
|
|
|
|
|
##################################
|
|
|
|
# Import External packages
|
|
|
|
##################################
|
2014-08-14 14:11:07 +02:00
|
|
|
import base64
|
2020-02-03 14:51:51 +01:00
|
|
|
import io
|
|
|
|
import gzip
|
2014-08-14 14:11:07 +02:00
|
|
|
import os
|
2020-02-03 14:51:51 +01:00
|
|
|
import sys
|
2014-08-14 14:11:07 +02:00
|
|
|
import time
|
2020-02-07 10:53:45 +01:00
|
|
|
|
2021-05-27 17:28:20 +02:00
|
|
|
from hashlib import md5
|
|
|
|
from uuid import uuid4
|
2021-04-28 15:24:33 +02:00
|
|
|
|
2021-06-02 14:42:23 +02:00
|
|
|
sys.path.append(os.environ['AIL_BIN'])
|
2021-04-28 15:24:33 +02:00
|
|
|
##################################
|
|
|
|
# Import Project packages
|
|
|
|
##################################
|
2021-06-02 14:42:23 +02:00
|
|
|
from modules.abstract_module import AbstractModule
|
2023-03-16 15:50:42 +01:00
|
|
|
from lib.ail_core import get_ail_uuid
|
2023-04-13 14:25:02 +02:00
|
|
|
from lib.ConfigLoader import ConfigLoader
|
2022-12-19 16:38:20 +01:00
|
|
|
from lib.data_retention_engine import update_obj_date
|
2023-04-13 14:25:02 +02:00
|
|
|
from lib.objects.Items import Item
|
|
|
|
|
2023-01-18 16:28:08 +01:00
|
|
|
# from lib import Statistics
|
2014-08-06 11:43:40 +02:00
|
|
|
|
2021-04-28 15:24:33 +02:00
|
|
|
class Global(AbstractModule):
    """
    Global module for AIL framework.

    Receives messages of the form ``<item path> <base64(gzip content)>``,
    validates the target path, de-duplicates against items already on disk
    and writes new items under ``ITEMS_FOLDER`` before forwarding their id
    to the 'Item' and 'Sync' queues.
    """

    def __init__(self):
        super().__init__()

        # Counters used by computeNone() to report throughput.
        self.processed_item = 0
        self.time_last_stats = time.time()

        config_loader = ConfigLoader()

        # Get and sanitize ITEM DIRECTORY
        # # TODO: rename PASTE => ITEM
        self.ITEMS_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'
        # Resolve symlinks and guarantee exactly one trailing separator, so
        # the prefix comparison in compute() cannot match a sibling directory.
        self.ITEMS_FOLDER = os.path.join(os.path.realpath(self.ITEMS_FOLDER), '')

        # Waiting time in seconds between two processed messages
        # NOTE(review): presumably consumed by AbstractModule.run() - confirm.
        self.pending_seconds = 0.5

        # Send module state to logs
        self.logger.info(f"Module {self.module_name} initialized")
        # Send module state to logs # TODO MOVE ME IN INIT SCRIPT
        self.redis_logger.critical(f"AIL {get_ail_uuid()} started")

    def computeNone(self):
        """
        Publish processing statistics.

        When more than 30 seconds have elapsed since the last report, print
        and log the number of processed items, then reset the counters.
        NOTE(review): presumably called by the AbstractModule run loop when
        no message is available - confirm.
        """
        difftime = time.time() - self.time_last_stats
        if int(difftime) > 30:
            to_print = f'Global; ; ; ;glob Processed {self.processed_item} item(s) in {difftime} s'
            print(to_print)
            self.redis_logger.debug(to_print)

            self.time_last_stats = time.time()
            self.processed_item = 0

    def compute(self, message, r_result=False):
        """
        Save the item carried by ``message`` on disk and queue it.

        :param message: whitespace-separated pair
            ``<item path> <base64 encoded gzip content>``
        :param r_result: when True, return the id of the saved item
        :return: the item id if ``r_result`` is True and the item was saved,
            otherwise None
        """
        # Recovering the streamed message information
        splitted = message.split()
        if len(splitted) != 2:
            self.logger.debug(f"Empty Item: {message} not processed")
            print(f"Empty Item: {message} not processed")
            return None

        item_path, gzip64encoded = splitted

        # Remove ITEMS_FOLDER from item path (crawled item + submitted).
        # str.replace is a no-op when the folder is absent.
        item_path = item_path.replace(self.ITEMS_FOLDER, '', 1)

        # Keep the file name within the usual 255 characters filesystem
        # limit: 215 kept chars + 36 chars uuid + '.gz' = 254.
        file_name_item = item_path.split('/')[-1]
        if len(file_name_item) > 255:
            new_file_name_item = f'{file_name_item[:215]}{uuid4()}.gz'
            item_path = self.rreplace(item_path, file_name_item, new_file_name_item, 1)

        # Creating the full filepath
        filename = os.path.realpath(os.path.join(self.ITEMS_FOLDER, item_path))

        # Incorrect filename: the resolved path must stay inside ITEMS_FOLDER
        if os.path.commonprefix([filename, self.ITEMS_FOLDER]) != self.ITEMS_FOLDER:
            self.logger.warning(f'Global; Path traversal detected {filename}')
            print(f'Global; Path traversal detected {filename}')
            return None

        # Decode compressed base64
        decoded = base64.standard_b64decode(gzip64encoded)
        new_file_content = self.gunzip_bytes_obj(filename, decoded)
        if not new_file_content:
            # Invalid gzip payload (already logged) or empty content.
            return None

        filename = self.check_filename(filename, new_file_content)
        if not filename:
            # Duplicate item, or existing file could not be read.
            return None

        # create subdir; exist_ok avoids a race between check and creation
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        # The item is stored still gzip-compressed, as received.
        with open(filename, 'wb') as f:
            f.write(decoded)

        # remove self.ITEMS_FOLDER from the saved path to build the item id
        item_id = filename.replace(self.ITEMS_FOLDER, '', 1)

        item = Item(item_id)

        update_obj_date(item.get_date(), 'item')

        self.add_message_to_queue(item_id, 'Item')
        self.processed_item += 1

        # DIRTY FIX AIL SYNC - SEND TO SYNC MODULE
        # # FIXME: DIRTY FIX
        sync_message = f'{item.get_type()};{item.get_subtype(r_str=True)};{item.get_id()}'
        print(sync_message)
        self.add_message_to_queue(sync_message, 'Sync')

        print(item_id)
        if r_result:
            return item_id
        return None

    def check_filename(self, filename, new_file_content):
        """
        Check if file is not a duplicated file.

        If a file already exists at ``filename`` with different content, the
        MD5 of the new content is appended to the name (before a trailing
        '.gz' if present) and that new name is checked in turn.

        :param filename: full path where the item should be saved
        :param new_file_content: uncompressed content (bytes) of the new item
        :return: the filename to write to if new file, else None
        """
        # No file at this path yet: nothing to compare against.
        if not os.path.isfile(filename):
            return filename

        self.logger.info(f'File already exist {filename}')
        print(f'File already exist {filename}')

        # Check that file already exists but content differs
        curr_file_content = self.gunzip_file(filename)
        if not curr_file_content:
            # File not unzipped
            return None

        # Compare file content with message content with MD5 checksums
        curr_file_md5 = md5(curr_file_content).hexdigest()
        new_file_md5 = md5(new_file_content).hexdigest()

        if new_file_md5 == curr_file_md5:
            # Ignore duplicate checksum equals
            self.logger.debug(f'ignore duplicated file {filename}')
            print(f'ignore duplicated file {filename}')
            return None

        # MD5 are not equals, verify filename
        if filename.endswith('.gz'):
            filename = f'{filename[:-3]}_{new_file_md5}.gz'
        else:
            filename = f'{filename}_{new_file_md5}'
        self.logger.debug(f'new file to check: {filename}')

        if os.path.isfile(filename):
            # Ignore duplicate
            self.logger.debug(f'ignore duplicated file {filename}')
            print(f'ignore duplicated file {filename}')
            return None

        return filename

    def gunzip_file(self, filename):
        """
        Unzip a file.

        :param filename: path of a gzip-compressed file
        :return: the uncompressed content (bytes), or None if the file is
            truncated, not gzipped, or its compressed stream is corrupted
        """
        # Local import: gzip can surface zlib.error for corrupted streams,
        # and zlib is not imported at module level.
        import zlib

        curr_file_content = None
        try:
            with gzip.open(filename, 'rb') as f:
                curr_file_content = f.read()
        except EOFError:
            self.logger.warning(f'Global; Incomplete file: {filename}')
            print(f'Global; Incomplete file: {filename}')
            # TODO: save daily stats (module:Global:incomplete_file)
        except OSError:
            # gzip.BadGzipFile is a subclass of OSError.
            self.logger.warning(f'Global; Not a gzipped file: {filename}')
            print(f'Global; Not a gzipped file: {filename}')
            # TODO: save daily stats (module:Global:invalid_file)
        except zlib.error:
            # Valid gzip header but damaged compressed data.
            self.logger.warning(f'Global; Corrupted gzip file: {filename}')
            print(f'Global; Corrupted gzip file: {filename}')

        return curr_file_content

    # # TODO: add stats incomplete_file/Not a gzipped file
    def gunzip_bytes_obj(self, filename, bytes_obj):
        """
        Decompress an in-memory gzip payload.

        :param filename: target path of the item, only used in log messages
        :param bytes_obj: gzip-compressed bytes
        :return: the uncompressed bytes, or None if decompression failed
        """
        gunzipped_bytes_obj = None
        try:
            with gzip.GzipFile(fileobj=io.BytesIO(bytes_obj), mode='rb') as fo:
                gunzipped_bytes_obj = fo.read()
        except Exception as e:
            # Deliberately broad: any failure means the payload is unusable.
            self.logger.warning(f'Global; Invalid Gzip file: {filename}, {e}')
            print(f'Global; Invalid Gzip file: {filename}, {e}')

        return gunzipped_bytes_obj

    def rreplace(self, s, old, new, occurrence):
        """
        Replace the last ``occurrence`` occurrences of ``old`` in ``s`` by ``new``.

        :return: the resulting string
        """
        return new.join(s.rsplit(old, occurrence))
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: instantiate the Global module and start its
    # processing loop (run() is provided by AbstractModule).
    module = Global()
    module.run()
|