Move run_process back to Groomer object

pull/12/head
Dan Puttick 2017-03-01 17:43:43 -05:00
parent 781d0a76af
commit 8d7dd1197f
2 changed files with 29 additions and 35 deletions

View File

@@ -66,8 +66,8 @@ install:
- rm fraunhoferlibrary.zip - rm fraunhoferlibrary.zip
- 7z x -p42 42.zip - 7z x -p42 42.zip
# Some random samples # Some random samples
- wget http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3 # - wget http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3
- wget http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4 # - wget http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4
- wget http://thewalter.net/stef/software/rtfx/sample.rtf - wget http://thewalter.net/stef/software/rtfx/sample.rtf
- popd - popd

View File

@@ -182,7 +182,7 @@ class File(FileBase):
def write_log(self): def write_log(self):
"""Print the logs related to the current file being processed.""" """Print the logs related to the current file being processed."""
# TODO: move to helpers? # TODO: move to helpers.py?
tmp_log = self.logger.log.fields(**self.log_details) tmp_log = self.logger.log.fields(**self.log_details)
if self.is_dangerous(): if self.is_dangerous():
tmp_log.warning(self.log_string) tmp_log.warning(self.log_string)
@@ -191,23 +191,13 @@ class File(FileBase):
else: else:
tmp_log.debug(self.log_string) tmp_log.debug(self.log_string)
# Make this an @property # TODO: Make this an @property
def has_metadata(self): def has_metadata(self):
if self.mimetype in Config.mimes_metadata: if self.mimetype in Config.mimes_metadata:
return True return True
return False return False
def _run_process(self, command_string, timeout=None): def make_tempdir(self):
"""Run command_string in a subprocess, wait until it finishes."""
args = shlex.split(command_string)
with open(self.logger.log_debug_err, 'ab') as stderr, open(self.logger.log_debug_out, 'ab') as stdout:
try:
subprocess.check_call(args, stdout=stdout, stderr=stderr, timeout=timeout)
except (subprocess.TimeoutExpired, subprocess.CalledProcessError):
return
return True
def _make_tempdir(self):
"""Make a temporary directory.""" """Make a temporary directory."""
self.tempdir_path = self.dst_path + '_temp' self.tempdir_path = self.dst_path + '_temp'
if not os.path.exists(self.tempdir_path): if not os.path.exists(self.tempdir_path):
@@ -348,6 +338,7 @@ class File(FileBase):
try: try:
lodoc = zipfile.ZipFile(self.src_path, 'r') lodoc = zipfile.ZipFile(self.src_path, 'r')
except: except:
# TODO: are there specific exceptions we should catch here? Or is anything ok?
self.add_log_details('invalid', True) self.add_log_details('invalid', True)
self.make_dangerous() self.make_dangerous()
for f in lodoc.infolist(): for f in lodoc.infolist():
@@ -362,7 +353,7 @@ class File(FileBase):
self.add_log_details('processing_type', 'pdf') self.add_log_details('processing_type', 'pdf')
xmlDoc = PDFiD(self.src_path) xmlDoc = PDFiD(self.src_path)
oPDFiD = cPDFiD(xmlDoc, True) oPDFiD = cPDFiD(xmlDoc, True)
# TODO: other keywords? # TODO: are there other characteristics which should be dangerous?
if oPDFiD.encrypt.count > 0: if oPDFiD.encrypt.count > 0:
self.add_log_details('encrypted', True) self.add_log_details('encrypted', True)
self.make_dangerous() self.make_dangerous()
@@ -399,6 +390,7 @@ class File(FileBase):
####################### #######################
# Metadata extractors # Metadata extractors
def _metadata_exif(self, metadata_file_path): def _metadata_exif(self, metadata_file_path):
# TODO: this method is kind of long, can we shorten it?
img = open(self.src_path, 'rb') img = open(self.src_path, 'rb')
tags = None tags = None
@@ -486,7 +478,7 @@ class File(FileBase):
# TODO: make sure this method works for png, gif, tiff # TODO: make sure this method works for png, gif, tiff
if self.has_metadata(): if self.has_metadata():
self.extract_metadata() self.extract_metadata()
tempdir_path = self._make_tempdir() tempdir_path = self.make_tempdir()
tempfile_path = os.path.join(tempdir_path, self.filename) tempfile_path = os.path.join(tempdir_path, self.filename)
warnings.simplefilter('error', Image.DecompressionBombWarning) warnings.simplefilter('error', Image.DecompressionBombWarning)
try: # Do image conversions try: # Do image conversions
@@ -495,6 +487,7 @@ class File(FileBase):
img_out.save(tempfile_path) img_out.save(tempfile_path)
self.src_path = tempfile_path self.src_path = tempfile_path
except Exception as e: # Catch decompression bombs except Exception as e: # Catch decompression bombs
# TODO: change this from all Exceptions to specific DecompressionBombWarning
# TODO: change this from printing to logging # TODO: change this from printing to logging
print("Caught exception (possible decompression bomb?) while translating file {}.".format(self.src_path)) print("Caught exception (possible decompression bomb?) while translating file {}.".format(self.src_path))
print(e) print(e)
@@ -512,7 +505,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
def process_dir(self, src_dir, dst_dir): def process_dir(self, src_dir, dst_dir):
"""Main function coordinating file processing.""" """Main function coordinating file processing."""
# TODO: Think we want to move write_log elsewhere: # TODO: we probably want to move this write_log elsewhere:
# if self.recursive_archive_depth > 0: # if self.recursive_archive_depth > 0:
# self.write_log() # self.write_log()
# TODO: Can we clean up the way we handle relative_path? # TODO: Can we clean up the way we handle relative_path?
@@ -532,7 +525,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
if file.is_recursive: if file.is_recursive:
self.process_archive(file) self.process_archive(file)
else: else:
# TODO: Check if should be copied, make an attribute for should be copied True/False # TODO: Make a File attribute for should be copied True/False and check it
self._safe_copy() self._safe_copy()
file.write_log() file.write_log()
if hasattr(file, "tempdir_path"): if hasattr(file, "tempdir_path"):
@@ -548,33 +541,34 @@ class KittenGroomerFileCheck(KittenGroomerBase):
if self.recursive_archive_depth >= self.max_recursive_depth: if self.recursive_archive_depth >= self.max_recursive_depth:
self._handle_archivebomb(file) self._handle_archivebomb(file)
else: else:
tempdir_path = file._make_tempdir() tempdir_path = file.make_tempdir()
# Unpack the archive command_str = '{} -p1 x "{}" -o"{}" -bd -aoa'
base_command = '{} -p1 x "{}" -o"{}" -bd -aoa' unpack_command = command_str.format(SEVENZ_PATH,
extract_command = base_command.format(SEVENZ_PATH, file.src_path, tempdir_path) file.src_path, tempdir_path)
file._run_process(extract_command) self._run_process(unpack_command)
# Add it to the tree # TODO: check that tree is working correctly here
self.logger.tree(tempdir_path) self.logger.tree(tempdir_path)
# List all files, process them
self.process_dir(tempdir_path, file.dst_path) self.process_dir(tempdir_path, file.dst_path)
# Clean up
self._safe_rmtree(tempdir_path) self._safe_rmtree(tempdir_path)
self.recursive_archive_depth -= 1 self.recursive_archive_depth -= 1
def _handle_archivebomb(self, file): def _handle_archivebomb(self, file):
file.make_dangerous() file.make_dangerous()
file.add_log_details('Archive Bomb', True) file.add_log_details('Archive Bomb', True)
self.logger.log.warning('ARCHIVE BOMB.') self.logger.log.warning('ARCHIVE BOMB.')
self.logger.log.warning('The content of the archive contains recursively other archives.') self.logger.log.warning('The content of the archive contains recursively other archives.')
self.logger.log.warning('This is a bad sign so the archive is not extracted to the destination key.') self.logger.log.warning('This is a bad sign so the archive is not extracted to the destination key.')
# TODO: are we sure we want to delete the archive on the source key? Commenting out for now # TODO: delete whatever we want to delete that's already been copied to dest dir
# self._safe_rmtree(file.src_dir)
# What is the goal of this code: def _run_process(self, command_string, timeout=None):
# if file.src_dir.endswith('_temp'): """Run command_string in a subprocess, wait until it finishes."""
# # TODO: change the way bomb_path is constructed and the way we check for tempdir args = shlex.split(command_string)
# bomb_path = file.src_dir[:-len('_temp')] with open(self.logger.log_debug_err, 'ab') as stderr, open(self.logger.log_debug_out, 'ab') as stdout:
# self._safe_remove(bomb_path) try:
subprocess.check_call(args, stdout=stdout, stderr=stderr, timeout=timeout)
except (subprocess.TimeoutExpired, subprocess.CalledProcessError):
return
return True
def run(self): def run(self):
self.process_dir(self.src_root_dir, self.dst_root_dir) self.process_dir(self.src_root_dir, self.dst_root_dir)