From ca90a08159e93f32569b652353dd8466d025ace3 Mon Sep 17 00:00:00 2001
From: Eleanor Saitta
Date: Wed, 9 Dec 2015 20:26:26 -0500
Subject: [PATCH] Initial working version of EXIF splitting and image format validation by round-trip conversion.

---
 bin/filecheck.py         | 45 +++++++++++++++++++++++++++++++++++++++-
 kittengroomer/helpers.py | 27 +++++++++++++++++++++---
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/bin/filecheck.py b/bin/filecheck.py
index 07cc6fd..17c73ca 100644
--- a/bin/filecheck.py
+++ b/bin/filecheck.py
@@ -11,6 +11,10 @@ import oletools.oleid
 import olefile
 import officedissector
 
+import warnings
+import exifread
+from PIL import Image
+
 from pdfid import PDFiD, cPDFiD
 
 from kittengroomer import FileBase, KittenGroomerBase, main
@@ -30,6 +34,9 @@ mimes_compressed = ['zip', 'rar', 'bzip2', 'lzip', 'lzma', 'lzop',
                     'xz', 'compress', 'gzip', 'tar']
 mimes_data = ['octet-stream']
 
+# Prepare image/ (sub types whose EXIF metadata gets extracted)
+mimes_metadata = ['jpeg', 'tiff']
+
 # Aliases
 aliases = {
     # Win executables
@@ -120,6 +127,10 @@ class File(FileBase):
             # there are no known extensions associated to this mimetype.
             pass
 
+    def has_image_metadata(self):
+        '''Tell whether the sub type of this file is listed in mimes_metadata'''
+        return self.sub_type in mimes_metadata
+
 
 class KittenGroomerFileCheck(KittenGroomerBase):
 
@@ -410,8 +421,40 @@ class KittenGroomerFileCheck(KittenGroomerBase):
 
     def image(self):
         '''Way to process an image'''
+        # Extract the metadata
+        if self.cur_file.has_image_metadata():
+            metadata_file = self._safe_metadata_split(".exif")
+            # _safe_metadata_split prints the error and returns False when
+            # the metadata file cannot be created: skip the extraction then.
+            if metadata_file:
+                with metadata_file, open(self.cur_file.src_path, 'rb') as img:
+                    tags = exifread.process_file(img)  # TODO: Switch to PyExifTool for raw, etc. support?
+                    for tag in sorted(tags.keys()):
+                        if tag not in ('JPEGThumbnail', 'TIFFThumbnail', 'EXIF MakerNote'):
+                            metadata_file.write("Key: {}\tValue: {}\n".format(tag, tags[tag]))
+                self.cur_file.add_log_details('metadata', 'exif')
+
+        # Create a temp directory
+        dst_dir, filename = os.path.split(self.cur_file.dst_path)
+        tmpdir = os.path.join(dst_dir, 'temp')
+        tmppath = os.path.join(tmpdir, filename)
+        self._safe_mkdir(tmpdir)
+
+        try:
+            # Do our image conversions
+            warnings.simplefilter('error', Image.DecompressionBombWarning)
+            im_in = Image.open(self.cur_file.src_path)
+            im_out = Image.frombytes(im_in.mode, im_in.size, im_in.tobytes())
+            im_out.save(tmppath)
+
+            # Copy the file back out
+            self._safe_copy(tmppath)
+        finally:
+            # Clean up the temp directory even if the conversion raised
+            self._safe_rmtree(tmpdir)
+
         self.cur_file.log_string += 'Image file'
-        self._media_processing()
+        self.cur_file.add_log_details('processing_type', 'image')
 
     def video(self):
         '''Way to process a video'''
diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py
index 5c578a7..90c7919 100644
--- a/kittengroomer/helpers.py
+++ b/kittengroomer/helpers.py
@@ -194,12 +194,12 @@ class KittenGroomerBase(object):
             os.remove(filepath)
 
     def _safe_mkdir(self, directory):
-        '''Remove a directory if it exists'''
+        '''Make a directory if it does not exist'''
         if not os.path.exists(directory):
             os.makedirs(directory)
 
     def _safe_copy(self, src=None, dst=None):
-        ''' Copy a file and create directory if needed '''
+        ''' Copy a file and create directory if needed'''
         if src is None:
             src = self.cur_file.src_path
         if dst is None:
@@ -214,8 +214,29 @@ class KittenGroomerBase(object):
             print(e)
             return False
 
+    def _safe_metadata_split(self, ext):
+        '''Create a separate file to hold this file's metadata
+
+        Return the file opened in 'w+' mode at dst_path + ext, or False
+        (after printing the error) if it cannot be created.
+        '''
+        dst = self.cur_file.dst_path
+        try:
+            # NOTE(review): the check is on the source side while the file is
+            # created on the destination side - confirm this is intended.
+            if os.path.exists(self.cur_file.src_path + ext):
+                raise KittenGroomerError("Cannot create split metadata file for \"" +
+                                         dst + "\", type '" +
+                                         ext + "': File exists.")
+            self._safe_mkdir(os.path.dirname(dst))
+            return open(dst + ext, 'w+')
+        except Exception as e:
+            # TODO: Logfile
+            print(e)
+            return False
+
     def _list_all_files(self, directory):
-        ''' Generate an iterator over all the files in a directory tree '''
+        ''' Generate an iterator over all the files in a directory tree'''
         for root, dirs, files in os.walk(directory):
             for filename in files:
                 filepath = os.path.join(root, filename)