AIL-framework/bin/modules/Global.py

234 lines
8.1 KiB
Python
Raw Normal View History

2018-05-04 13:53:29 +02:00
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Global Module
=================
This module consumes the Redis list created by the ZMQ_Feed_Q Module
and saves each item on disk so that other modules can work on it.
.. todo:: Be able to choose whether or not to delete the saved item after processing.
.. note:: Modules ZMQ_Something_Q and ZMQ_Something are closely bound: always put
the same Subscriber name in both of them.
Requirements
------------
*Need running Redis instances.
*Need the Mixer or the Importer Module running to be able to work properly.
"""
2021-04-28 15:24:33 +02:00
##################################
# Import External packages
##################################
import base64
import datetime
import gzip
import hashlib
import io
import os
import sys
import time
import zlib
from hashlib import md5
from uuid import uuid4
2021-04-28 15:24:33 +02:00
sys.path.append(os.environ['AIL_BIN'])
2021-04-28 15:24:33 +02:00
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib.ConfigLoader import ConfigLoader
from lib.data_retention_engine import update_obj_date
from lib import item_basic
# from lib import Statistics
2021-04-28 15:24:33 +02:00
class Global(AbstractModule):
    """
    Global module for AIL framework.

    Each incoming message is expected to hold two whitespace-separated fields:
    an item path and the base64-encoded gzip content of that item. The gzip
    payload is written under the items directory and the resulting item id is
    sent to the next queue.
    """

    def __init__(self):
        super().__init__()

        # Counters reported (and reset) by computeNone()
        self.processed_item = 0
        self.time_last_stats = time.time()

        # Get and sanitize ITEM DIRECTORY
        # # TODO: rename PASTE => ITEM
        self.PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'],
                                          self.process.config.get("Directories", "pastes"))
        # Canonical form of the items directory: symlinks resolved and always
        # terminated by a path separator, so it can be used as a safe prefix.
        self.PASTES_FOLDERS = os.path.join(os.path.realpath(self.PASTES_FOLDER), '')

        # Waiting time in seconds between two processed messages
        self.pending_seconds = 0.5

        # Send module state to logs
        self.redis_logger.info(f"Module {self.module_name} initialized")

    def computeNone(self):
        """
        Report the number of processed items and reset the counter.

        Nothing is reported until more than 30 seconds have elapsed since the
        previous report.
        NOTE(review): presumably invoked by AbstractModule when no message is
        available -- confirm against the base class.
        """
        difftime = time.time() - self.time_last_stats
        if int(difftime) <= 30:
            return

        to_print = f'Global; ; ; ;glob Processed {self.processed_item} item(s) in {difftime} s'
        print(to_print)
        self.redis_logger.debug(to_print)

        self.time_last_stats = time.time()
        self.processed_item = 0

    def compute(self, message, r_result=False):
        """
        Save one incoming item on disk and forward its id to the next queue.

        :param message: "<item path> <base64-encoded gzip content>"
        :param r_result: if True, return the id of the saved item
        :return: the item id when the item was saved and r_result is True,
                 None in every other case (malformed message, path outside
                 the items directory, invalid gzip payload, duplicate item)
        """
        # Recovering the streamed message information
        splitted = message.split()
        if len(splitted) != 2:
            self.redis_logger.debug(f"Empty Item: {message} not processed")
            print(f"Empty Item: {message} not processed")
            return None

        item, gzip64encoded = splitted

        # Remove PASTES_FOLDER from item path (crawled item + submitted)
        if self.PASTES_FOLDERS in item:
            item = item.replace(self.PASTES_FOLDERS, '', 1)

        # File names longer than 255 characters are shortened to their first
        # 215 characters followed by a random UUID and the '.gz' extension.
        file_name_item = item.split('/')[-1]
        if len(file_name_item) > 255:
            new_file_name_item = '{}{}.gz'.format(file_name_item[:215], str(uuid4()))
            item = self.rreplace(item, file_name_item, new_file_name_item, 1)

        # Creating the full, canonical filepath
        filename = os.path.realpath(os.path.join(self.PASTES_FOLDER, item))

        # Incorrect filename: the resolved path must live inside the items
        # directory. The comparison is done against the canonical directory
        # *with* its trailing separator: a character-wise prefix test against
        # the bare directory name would also accept sibling directories such as
        # "<items dir>_other/", and comparing a resolved path against an
        # unresolved one would reject everything behind a symlink.
        if not filename.startswith(self.PASTES_FOLDERS):
            self.redis_logger.warning(f'Global; Path traversal detected {filename}')
            print(f'Global; Path traversal detected {filename}')
            return None

        # Decode compressed base64
        decoded = base64.standard_b64decode(gzip64encoded)
        new_file_content = self.gunzip_bytes_obj(filename, decoded)
        if not new_file_content:
            # Invalid or empty gzip payload: nothing to save
            return None

        filename = self.check_filename(filename, new_file_content)
        if not filename:
            # Duplicate item (or unreadable existing file): nothing to save
            return None

        # Create subdir; exist_ok avoids failing if the directory appears
        # between an existence check and its creation.
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        # The payload is stored as received (still gzip-compressed)
        with open(filename, 'wb') as f:
            f.write(decoded)

        # The item id is the path relative to the items directory
        item_id = filename
        if self.PASTES_FOLDERS in item_id:
            item_id = item_id.replace(self.PASTES_FOLDERS, '', 1)

        update_obj_date(item_basic.get_item_date(item_id), 'item')

        self.send_message_to_queue(item_id)
        self.processed_item += 1
        print(item_id)
        if r_result:
            return item_id
        return None

    def check_filename(self, filename, new_file_content):
        """
        Check if file is not a duplicated file.

        :param filename: absolute path where the item should be saved
        :param new_file_content: decompressed content of the incoming item
        :return: the path to write to if the item is new, else None.
                 If a file with a different content already exists at
                 filename, the returned path has the MD5 of the new content
                 appended to its name (before a trailing '.gz', if any).
        """
        # No file at this path yet: the name can be used as is
        if not os.path.isfile(filename):
            return filename

        self.redis_logger.warning(f'File already exist {filename}')
        print(f'File already exist {filename}')

        # File already exists: check whether its content differs
        curr_file_content = self.gunzip_file(filename)
        if not curr_file_content:
            # File not unzipped
            return None

        # Compare file content with message content with MD5 checksums
        curr_file_md5 = md5(curr_file_content).hexdigest()
        new_file_md5 = md5(new_file_content).hexdigest()

        if new_file_md5 == curr_file_md5:
            # Ignore duplicate checksum equals
            self.redis_logger.debug(f'ignore duplicated file {filename}')
            print(f'ignore duplicated file {filename}')
            return None

        # MD5 are not equals, derive a new filename from the new checksum
        if filename.endswith('.gz'):
            filename = f'{filename[:-3]}_{new_file_md5}.gz'
        else:
            filename = f'{filename}_{new_file_md5}'
        self.redis_logger.debug(f'new file to check: {filename}')

        if os.path.isfile(filename):
            # Ignore duplicate: this exact content was already saved
            self.redis_logger.debug(f'ignore duplicated file {filename}')
            print(f'ignore duplicated file {filename}')
            return None

        return filename

    def gunzip_file(self, filename):
        """
        Unzip a file stored on disk.

        :param filename: path of the gzip file to read
        :return: the decompressed bytes, or None if the file is truncated,
                 is not a gzip file or holds corrupted compressed data
        """
        # # TODO: publish daily stats on failure (incomplete_file / invalid_file)
        try:
            with gzip.open(filename, 'rb') as f:
                return f.read()
        except EOFError:
            self.redis_logger.warning(f'Global; Incomplete file: {filename}')
            print(f'Global; Incomplete file: {filename}')
        except zlib.error:
            # Valid gzip header but damaged compressed stream
            self.redis_logger.warning(f'Global; Corrupted gzip file: {filename}')
            print(f'Global; Corrupted gzip file: {filename}')
        except OSError:
            self.redis_logger.warning(f'Global; Not a gzipped file: {filename}')
            print(f'Global; Not a gzipped file: {filename}')
        return None

    # # TODO: add stats incomplete_file/Not a gzipped file
    def gunzip_bytes_obj(self, filename, bytes_obj):
        """
        Unzip an in-memory gzip payload.

        :param filename: path the payload is meant for (only used in log messages)
        :param bytes_obj: gzip-compressed bytes
        :return: the decompressed bytes, or None if decompression failed
        """
        gunzipped_bytes_obj = None
        try:
            with gzip.GzipFile(fileobj=io.BytesIO(bytes_obj), mode='rb') as fo:
                gunzipped_bytes_obj = fo.read()
        except Exception as e:
            # Deliberately broad: any failure while decompressing the payload
            # is logged and reported to the caller as None.
            self.redis_logger.warning(f'Global; Invalid Gzip file: {filename}, {e}')
            print(f'Global; Invalid Gzip file: {filename}, {e}')
        return gunzipped_bytes_obj

    def rreplace(self, s, old, new, occurrence):
        """
        Replace occurrences of a substring, starting from the end of the string.

        :param s: source string
        :param old: substring to replace
        :param new: replacement string
        :param occurrence: maximum number of replacements, counted from the right
        :return: the resulting string
        """
        return new.join(s.rsplit(old, occurrence))
if __name__ == '__main__':
    # Script entry point: build the module, then hand control to its run loop.
    module = Global()
    module.run()