mirror of https://github.com/CIRCL/PyCIRCLean
Move run_process back to Groomer object
parent
781d0a76af
commit
8d7dd1197f
|
@ -66,8 +66,8 @@ install:
|
||||||
- rm fraunhoferlibrary.zip
|
- rm fraunhoferlibrary.zip
|
||||||
- 7z x -p42 42.zip
|
- 7z x -p42 42.zip
|
||||||
# Some random samples
|
# Some random samples
|
||||||
- wget http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3
|
# - wget http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3
|
||||||
- wget http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4
|
# - wget http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4
|
||||||
- wget http://thewalter.net/stef/software/rtfx/sample.rtf
|
- wget http://thewalter.net/stef/software/rtfx/sample.rtf
|
||||||
- popd
|
- popd
|
||||||
|
|
||||||
|
|
|
@ -182,7 +182,7 @@ class File(FileBase):
|
||||||
|
|
||||||
def write_log(self):
|
def write_log(self):
|
||||||
"""Print the logs related to the current file being processed."""
|
"""Print the logs related to the current file being processed."""
|
||||||
# TODO: move to helpers?
|
# TODO: move to helpers.py?
|
||||||
tmp_log = self.logger.log.fields(**self.log_details)
|
tmp_log = self.logger.log.fields(**self.log_details)
|
||||||
if self.is_dangerous():
|
if self.is_dangerous():
|
||||||
tmp_log.warning(self.log_string)
|
tmp_log.warning(self.log_string)
|
||||||
|
@ -191,23 +191,13 @@ class File(FileBase):
|
||||||
else:
|
else:
|
||||||
tmp_log.debug(self.log_string)
|
tmp_log.debug(self.log_string)
|
||||||
|
|
||||||
# Make this an @property
|
# TODO: Make this an @property
|
||||||
def has_metadata(self):
|
def has_metadata(self):
|
||||||
if self.mimetype in Config.mimes_metadata:
|
if self.mimetype in Config.mimes_metadata:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _run_process(self, command_string, timeout=None):
|
def make_tempdir(self):
|
||||||
"""Run command_string in a subprocess, wait until it finishes."""
|
|
||||||
args = shlex.split(command_string)
|
|
||||||
with open(self.logger.log_debug_err, 'ab') as stderr, open(self.logger.log_debug_out, 'ab') as stdout:
|
|
||||||
try:
|
|
||||||
subprocess.check_call(args, stdout=stdout, stderr=stderr, timeout=timeout)
|
|
||||||
except (subprocess.TimeoutExpired, subprocess.CalledProcessError):
|
|
||||||
return
|
|
||||||
return True
|
|
||||||
|
|
||||||
def _make_tempdir(self):
|
|
||||||
"""Make a temporary directory."""
|
"""Make a temporary directory."""
|
||||||
self.tempdir_path = self.dst_path + '_temp'
|
self.tempdir_path = self.dst_path + '_temp'
|
||||||
if not os.path.exists(self.tempdir_path):
|
if not os.path.exists(self.tempdir_path):
|
||||||
|
@ -348,6 +338,7 @@ class File(FileBase):
|
||||||
try:
|
try:
|
||||||
lodoc = zipfile.ZipFile(self.src_path, 'r')
|
lodoc = zipfile.ZipFile(self.src_path, 'r')
|
||||||
except:
|
except:
|
||||||
|
# TODO: are there specific exceptions we should catch here? Or is anything ok
|
||||||
self.add_log_details('invalid', True)
|
self.add_log_details('invalid', True)
|
||||||
self.make_dangerous()
|
self.make_dangerous()
|
||||||
for f in lodoc.infolist():
|
for f in lodoc.infolist():
|
||||||
|
@ -362,7 +353,7 @@ class File(FileBase):
|
||||||
self.add_log_details('processing_type', 'pdf')
|
self.add_log_details('processing_type', 'pdf')
|
||||||
xmlDoc = PDFiD(self.src_path)
|
xmlDoc = PDFiD(self.src_path)
|
||||||
oPDFiD = cPDFiD(xmlDoc, True)
|
oPDFiD = cPDFiD(xmlDoc, True)
|
||||||
# TODO: other keywords?
|
# TODO: are there other characteristics which should be dangerous?
|
||||||
if oPDFiD.encrypt.count > 0:
|
if oPDFiD.encrypt.count > 0:
|
||||||
self.add_log_details('encrypted', True)
|
self.add_log_details('encrypted', True)
|
||||||
self.make_dangerous()
|
self.make_dangerous()
|
||||||
|
@ -399,6 +390,7 @@ class File(FileBase):
|
||||||
#######################
|
#######################
|
||||||
# Metadata extractors
|
# Metadata extractors
|
||||||
def _metadata_exif(self, metadata_file_path):
|
def _metadata_exif(self, metadata_file_path):
|
||||||
|
# TODO: this method is kind of long, can we shorten it?
|
||||||
img = open(self.src_path, 'rb')
|
img = open(self.src_path, 'rb')
|
||||||
tags = None
|
tags = None
|
||||||
|
|
||||||
|
@ -486,7 +478,7 @@ class File(FileBase):
|
||||||
# TODO: make sure this method works for png, gif, tiff
|
# TODO: make sure this method works for png, gif, tiff
|
||||||
if self.has_metadata():
|
if self.has_metadata():
|
||||||
self.extract_metadata()
|
self.extract_metadata()
|
||||||
tempdir_path = self._make_tempdir()
|
tempdir_path = self.make_tempdir()
|
||||||
tempfile_path = os.path.join(tempdir_path, self.filename)
|
tempfile_path = os.path.join(tempdir_path, self.filename)
|
||||||
warnings.simplefilter('error', Image.DecompressionBombWarning)
|
warnings.simplefilter('error', Image.DecompressionBombWarning)
|
||||||
try: # Do image conversions
|
try: # Do image conversions
|
||||||
|
@ -495,6 +487,7 @@ class File(FileBase):
|
||||||
img_out.save(tempfile_path)
|
img_out.save(tempfile_path)
|
||||||
self.src_path = tempfile_path
|
self.src_path = tempfile_path
|
||||||
except Exception as e: # Catch decompression bombs
|
except Exception as e: # Catch decompression bombs
|
||||||
|
# TODO: change this from all Exceptions to specific DecompressionBombWarning
|
||||||
# TODO: change this from printing to logging
|
# TODO: change this from printing to logging
|
||||||
print("Caught exception (possible decompression bomb?) while translating file {}.".format(self.src_path))
|
print("Caught exception (possible decompression bomb?) while translating file {}.".format(self.src_path))
|
||||||
print(e)
|
print(e)
|
||||||
|
@ -512,7 +505,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
|
|
||||||
def process_dir(self, src_dir, dst_dir):
|
def process_dir(self, src_dir, dst_dir):
|
||||||
"""Main function coordinating file processing."""
|
"""Main function coordinating file processing."""
|
||||||
# TODO: Think we want to move write_log elsewhere:
|
# TODO: we probably want to move this write_log elsewhere:
|
||||||
# if self.recursive_archive_depth > 0:
|
# if self.recursive_archive_depth > 0:
|
||||||
# self.write_log()
|
# self.write_log()
|
||||||
# TODO: Can we clean up the way we handle relative_path?
|
# TODO: Can we clean up the way we handle relative_path?
|
||||||
|
@ -532,7 +525,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
if file.is_recursive:
|
if file.is_recursive:
|
||||||
self.process_archive(file)
|
self.process_archive(file)
|
||||||
else:
|
else:
|
||||||
# TODO: Check if should be copied, make an attribute for should be copied True/False
|
# TODO: Make a File attribute for should be copied True/False and check it
|
||||||
self._safe_copy()
|
self._safe_copy()
|
||||||
file.write_log()
|
file.write_log()
|
||||||
if hasattr(file, "tempdir_path"):
|
if hasattr(file, "tempdir_path"):
|
||||||
|
@ -548,33 +541,34 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
if self.recursive_archive_depth >= self.max_recursive_depth:
|
if self.recursive_archive_depth >= self.max_recursive_depth:
|
||||||
self._handle_archivebomb(file)
|
self._handle_archivebomb(file)
|
||||||
else:
|
else:
|
||||||
tempdir_path = file._make_tempdir()
|
tempdir_path = file.make_tempdir()
|
||||||
# Unpack the archive
|
command_str = '{} -p1 x "{}" -o"{}" -bd -aoa'
|
||||||
base_command = '{} -p1 x "{}" -o"{}" -bd -aoa'
|
unpack_command = command_str.format(SEVENZ_PATH,
|
||||||
extract_command = base_command.format(SEVENZ_PATH, file.src_path, tempdir_path)
|
file.src_path, tempdir_path)
|
||||||
file._run_process(extract_command)
|
self._run_process(unpack_command)
|
||||||
# Add it to the tree
|
# TODO: check that tree is working correctly here
|
||||||
self.logger.tree(tempdir_path)
|
self.logger.tree(tempdir_path)
|
||||||
# List all files, process them
|
|
||||||
self.process_dir(tempdir_path, file.dst_path)
|
self.process_dir(tempdir_path, file.dst_path)
|
||||||
# Clean up
|
|
||||||
self._safe_rmtree(tempdir_path)
|
self._safe_rmtree(tempdir_path)
|
||||||
self.recursive_archive_depth -= 1
|
self.recursive_archive_depth -= 1
|
||||||
|
|
||||||
|
|
||||||
def _handle_archivebomb(self, file):
|
def _handle_archivebomb(self, file):
|
||||||
file.make_dangerous()
|
file.make_dangerous()
|
||||||
file.add_log_details('Archive Bomb', True)
|
file.add_log_details('Archive Bomb', True)
|
||||||
self.logger.log.warning('ARCHIVE BOMB.')
|
self.logger.log.warning('ARCHIVE BOMB.')
|
||||||
self.logger.log.warning('The content of the archive contains recursively other archives.')
|
self.logger.log.warning('The content of the archive contains recursively other archives.')
|
||||||
self.logger.log.warning('This is a bad sign so the archive is not extracted to the destination key.')
|
self.logger.log.warning('This is a bad sign so the archive is not extracted to the destination key.')
|
||||||
# TODO: are we sure we want to delete the archive on the source key? Commenting out for now
|
# TODO: delete whatever we want to delete that's already been copied to dest dir
|
||||||
# self._safe_rmtree(file.src_dir)
|
|
||||||
# What is the goal of this code:
|
def _run_process(self, command_string, timeout=None):
|
||||||
# if file.src_dir.endswith('_temp'):
|
"""Run command_string in a subprocess, wait until it finishes."""
|
||||||
# # TODO: change the way bomb_path is constructed and the way we check for tempdir
|
args = shlex.split(command_string)
|
||||||
# bomb_path = file.src_dir[:-len('_temp')]
|
with open(self.logger.log_debug_err, 'ab') as stderr, open(self.logger.log_debug_out, 'ab') as stdout:
|
||||||
# self._safe_remove(bomb_path)
|
try:
|
||||||
|
subprocess.check_call(args, stdout=stdout, stderr=stderr, timeout=timeout)
|
||||||
|
except (subprocess.TimeoutExpired, subprocess.CalledProcessError):
|
||||||
|
return
|
||||||
|
return True
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
self.process_dir(self.src_root_dir, self.dst_root_dir)
|
self.process_dir(self.src_root_dir, self.dst_root_dir)
|
||||||
|
|
Loading…
Reference in New Issue