2018-05-04 13:53:29 +02:00
|
|
|
#!/usr/bin/env python3
|
2016-02-04 15:22:51 +01:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
import zmq
|
|
|
|
import base64
|
2018-04-16 14:50:04 +02:00
|
|
|
from io import StringIO
|
2018-09-19 13:38:31 +02:00
|
|
|
import datetime
|
2017-08-22 15:17:49 +02:00
|
|
|
import gzip
|
2016-02-04 15:22:51 +01:00
|
|
|
import argparse
|
2020-02-27 14:21:32 +01:00
|
|
|
import binascii
|
2016-02-04 15:22:51 +01:00
|
|
|
import os
|
2017-08-22 15:17:49 +02:00
|
|
|
import time, datetime
|
2020-02-25 14:41:45 +01:00
|
|
|
import re
|
2017-08-22 15:17:49 +02:00
|
|
|
|
|
|
|
'''
|
|
|
|
'
|
|
|
|
' Import content/pastes: read files from a directory and publish them on a ZeroMQ feed.
|
2018-09-19 13:38:31 +02:00
|
|
|
' If content is not gzip-compressed yet, it is compressed before being sent.
|
2017-08-22 15:17:49 +02:00
|
|
|
'
|
|
|
|
' /!\ WARNING /!\
|
2018-09-19 13:38:31 +02:00
|
|
|
Content to be imported can be placed in a directory tree of the form
|
2017-08-22 15:17:49 +02:00
|
|
|
root/
|
|
|
|
|
|
|
|
|
+-- Year/
|
|
|
|
|
|
|
|
|
+-- Month/
|
|
|
|
|
|
|
|
|
+-- Day/
|
|
|
|
|
|
|
|
|
+-- Content
|
|
|
|
e.g.:
|
|
|
|
~/to_import/2017/08/22/paste1.gz
|
2018-09-19 13:38:31 +02:00
|
|
|
|
|
|
|
If the date hierarchy is missing, it is generated from the current date,
|
|
|
|
e.g.:
|
|
|
|
~/to_import/paste1.gz
|
2017-08-22 15:17:49 +02:00
|
|
|
'
|
|
|
|
'''
|
|
|
|
|
2020-02-27 14:21:32 +01:00
|
|
|
def is_gzip_file(magic_nuber):
    """Tell whether the given leading bytes are the gzip signature.

    :param magic_nuber: bytes-like object holding the start of the content
                        (the caller passes the first two bytes)
    :return: True when the hexadecimal form of those bytes is exactly ``1f8b``
    """
    gzip_signature_hex = b'1f8b'
    header_hex = binascii.hexlify(magic_nuber)
    return header_hex == gzip_signature_hex
|
2018-04-16 14:50:04 +02:00
|
|
|
|
2017-08-22 15:17:49 +02:00
|
|
|
def is_hierachy_valid(path):
    """Check whether *path* ends with a ``Year/Month/Day/<name>`` hierarchy.

    The three components that precede the last one are parsed as integers
    and must form a valid calendar date.

    :param path: '/'-separated path of the file to import
    :return: True if the path carries a valid date hierarchy, False otherwise
    """
    parts = path.split('/')
    try:
        # Only the validity of the date matters, the object itself is unused.
        datetime.datetime(int(parts[-4]), int(parts[-3]), int(parts[-2]))
    except (ValueError, IndexError, OverflowError):
        # ValueError: a component is not a number, or the date does not exist.
        # IndexError: the path has fewer than four components.
        # OverflowError: a numeric component is too large for datetime.
        return False
    return True
|
|
|
|
|
2020-02-25 14:41:45 +01:00
|
|
|
def sanitize_str(str_var, invalid_char_regex):
    """Return *str_var* made safe for use as a path element.

    :param str_var: string to clean up
    :param invalid_char_regex: regular expression matching the forbidden
                               characters; every match becomes '-'
    :return: the cleaned string, with every space turned into '_'
    """
    forbidden = re.compile(invalid_char_regex)
    dashed = forbidden.sub("-", str_var)
    # Splitting on a single space and joining with '_' swaps each space.
    return '_'.join(dashed.split(' '))
|
2016-02-04 15:22:51 +01:00
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: walk a directory tree and publish every file
    # found on a ZeroMQ PUB socket. Content that is not gzip-compressed yet is
    # compressed first, and each message is sent as:
    #     <channel> import_dir/<feeder name>>><wanted path> <base64 content>
    parser = argparse.ArgumentParser(description='Take files from a directory and push them into a 0MQ feed.')
    parser.add_argument('-d', '--directory', type=str, required=True, help='Root directory to import')
    parser.add_argument('-p', '--port', type=int, default=5556, help='Zero MQ port')
    parser.add_argument('-c', '--channel', type=str, default='102', help='Zero MQ channel')
    parser.add_argument('-n', '--name', type=str, default='import_dir', help='Name of the feeder')
    parser.add_argument('-s', '--seconds', type=float, default=0.2, help='Second between pastes')
    parser.add_argument('--hierarchy', type=int, default=1, help='Number of parent directory forming the name')

    args = parser.parse_args()

    # Characters replaced by '-' in file/feeder names; the directory variant
    # keeps '/' so that path separators survive the sanitization.
    invalid_char = r'[\\/*?&%=:"<>|#\\\']'
    invalid_char_dir = r'[\\*?&%=:"<>|#\\\']'

    # Both values only depend on the command line: compute them once instead
    # of once per imported file.
    feeder_name = os.path.relpath(sanitize_str(args.name, invalid_char))
    channel = args.channel.encode()

    context = zmq.Context()
    socket = context.socket(zmq.PUB)
    try:
        socket.bind("tcp://*:{}".format(args.port))
        # Important: give subscribers time to connect, otherwise the first
        # message is lost.
        time.sleep(1)

        for dirname, _dirnames, filenames in os.walk(args.directory):
            for filename in filenames:
                complete_path = os.path.join(dirname, filename)

                with open(complete_path, 'rb') as f:
                    messagedata = f.read()

                # Check the gzip magic number; compress the content if it is
                # not gzip-encoded yet and reflect that in the name.
                if not is_gzip_file(messagedata[0:2]):
                    messagedata = gzip.compress(messagedata)
                    complete_path += '.gz'

                # NOTE(review): the previous code guarded everything below with
                # `complete_path[-4:] != '.gz'`. A 4-character slice can never
                # equal the 3-character '.gz', so the guard was always true and
                # its "incorrect type" branch never ran. The effective behaviour
                # (every file is sent) is kept; confirm whether gzip content
                # whose name lacks '.gz' was meant to be rejected.

                if not is_hierachy_valid(complete_path):
                    # The paste has no 'date hierarchy': build one from today's
                    # date under the name of its parent directory.
                    now = datetime.datetime.now()
                    path_parts = complete_path.split('/')
                    paste_name = sanitize_str(path_parts[-1], invalid_char)
                    directory = sanitize_str(path_parts[-2], invalid_char_dir)
                    wanted_path = os.path.join(directory, now.strftime("%Y"), now.strftime("%m"), now.strftime("%d"), paste_name)
                    wanted_path = os.path.relpath(wanted_path)
                else:
                    # Keep the date hierarchy (Year/Month/Day/name) preceded by
                    # `--hierarchy` parent directories.
                    wanted_path = os.path.relpath(complete_path)
                    wanted_path = wanted_path.split('/')
                    wanted_path = '/'.join(wanted_path[-(4+args.hierarchy):])
                    wanted_path = sanitize_str(wanted_path, invalid_char_dir)

                path_to_send = 'import_dir/' + feeder_name + '>>' + wanted_path
                s = b' '.join([channel, path_to_send.encode(), base64.b64encode(messagedata)])
                socket.send(s)
                print(path_to_send)
                # Throttle the feed: wait between two pastes.
                time.sleep(args.seconds)
    finally:
        # Release the ZeroMQ resources explicitly, even when the import is
        # interrupted or fails.
        socket.close()
        context.term()
|