From 4d8a1d1dafd7ef453e5a8653215c7ab214edcad6 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Wed, 15 Mar 2017 22:29:51 -0400 Subject: [PATCH] Add/update docstrings for filecheck and helpers --- bin/filecheck.py | 64 ++++++++++++++++++++++++++-------------- kittengroomer/helpers.py | 62 +++++++++++++++++++++++--------------- 2 files changed, 80 insertions(+), 46 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 6513442..3dc26dd 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -23,6 +23,8 @@ SEVENZ_PATH = '/usr/bin/7z' class Config: + """Configuration information for Filecheck.""" + # Application subtypes (mimetype: 'application/') mimes_ooxml = ['vnd.openxmlformats-officedocument.'] mimes_office = ['msword', 'vnd.ms-'] @@ -180,12 +182,13 @@ class File(FileBase): @property def has_metadata(self): + """True if filetype typically contains metadata, else False.""" if self.mimetype in Config.mimes_metadata: return True return False def make_tempdir(self): - """Make a temporary directory.""" + """Make a temporary directory at self.tempdir_path.""" self.tempdir_path = self.dst_path + '_temp' if not os.path.exists(self.tempdir_path): os.makedirs(self.tempdir_path) @@ -246,7 +249,7 @@ class File(FileBase): self.force_ext('.txt') def application(self): - """Processes an application specific file according to its subtype.""" + """Process an application specific file according to its subtype.""" for subtype, method in self.app_subtype_methods.items(): if subtype in self.sub_type: # TODO: should we change the logic so we don't iterate through all of the subtype methods? @@ -258,13 +261,13 @@ class File(FileBase): self._unknown_app() def _executables(self): - """Processes an executable file.""" + """Process an executable file.""" # LOG: change the processing_type property to some other name or include in file_string self.set_property('processing_type', 'executable') self.make_dangerous('executable') def _winoffice(self): - """Processes a winoffice file using olefile/oletools.""" + """Process a winoffice file using olefile/oletools.""" # LOG: processing_type property self.set_property('processing_type', 'WinOffice') oid = oletools.oleid.OleID(self.src_path) # First assume a valid file @@ -297,7 +300,7 @@ class File(FileBase): self.make_dangerous('flash') def _ooxml(self): - """Processes an ooxml file.""" + """Process an ooxml file.""" # LOG: processing_type property self.set_property('processing_type', 'ooxml') try: @@ -318,7 +321,7 @@ class File(FileBase): self.make_dangerous('embedded pack') def _libreoffice(self): - """Processes a libreoffice file.""" + """Process a libreoffice file.""" self.set_property('processing_type', 'libreoffice') # As long as there is no way to do a sanity check on the files => dangerous try: @@ -333,7 +336,7 @@ class File(FileBase): self.make_dangerous('macro') def _pdf(self): - """Processes a PDF file.""" + """Process a PDF file.""" # LOG: processing_type property self.set_property('processing_type', 'pdf') xmlDoc = PDFiD(self.src_path) @@ -351,26 +354,30 @@ class File(FileBase): self.make_dangerous('launch') def _archive(self): - """Processes an archive using 7zip. The archive is extracted to a - temporary directory and self.process_dir is called on that directory. - The recursive archive depth is increased to protect against archive - bombs.""" + """ + Process an archive using 7zip. + + The archive is extracted to a temporary directory and self.process_dir + is called on that directory. The recursive archive depth is increased + to protect against archive bombs. + """ # LOG: change this to something archive specific self.set_property('processing_type', 'archive') self.should_copy = False self.is_recursive = True def _unknown_app(self): - """Processes an unknown file.""" + """Process an unknown file.""" self.make_unknown() def _binary_app(self): - """Processses an unknown binary file.""" + """Process an unknown binary file.""" self.make_binary() ####################### # Metadata extractors def _metadata_exif(self, metadata_file_path): + """Read exif metadata from a jpg or tiff file using exifread.""" # TODO: this method is kind of long, can we shorten it somehow? img = open(self.src_path, 'rb') tags = None @@ -401,6 +408,7 @@ class File(FileBase): return True def _metadata_png(self, metadata_file_path): + """Extract metadata from a png file using PIL/Pillow.""" warnings.simplefilter('error', Image.DecompressionBombWarning) try: img = Image.open(self.src_path) @@ -420,6 +428,7 @@ class File(FileBase): return False def extract_metadata(self): + """Create metadata file and call correct metadata extraction method.""" metadata_file_path = self.create_metadata_file(".metadata.txt") mt = self.mimetype metadata_processing_method = self.metadata_mimetype_methods.get(mt) @@ -430,12 +439,12 @@ class File(FileBase): ####################### # ##### Media - audio and video aren't converted ###### def audio(self): - """Processes an audio file.""" + """Process an audio file.""" self.log_string += 'Audio file' self._media_processing() def video(self): - """Processes a video.""" + """Process a video.""" self.log_string += 'Video file' self._media_processing() @@ -444,11 +453,14 @@ class File(FileBase): self.set_property('processing_type', 'media') def image(self): - """Processes an image. + """ + Process an image. - Extracts metadata to dest key if metadata is present. Creates a - temporary directory on dest key, opens the using PIL.Image,saves it to - the temporary directory, and copies it to the destination.""" + Extracts metadata to dest key using self.extract_metada() if metadata + is present. Creates a temporary directory on dest key, opens the image + using PIL.Image, saves it to the temporary directory, and copies it to + the destination. + """ # TODO: make sure this method works for png, gif, tiff if self.has_metadata: self.extract_metadata() @@ -476,7 +488,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.max_recursive_depth = max_recursive_depth def process_dir(self, src_dir, dst_dir): - """Main function coordinating file processing.""" + """Process a directory on the source key.""" self.logger.tree(src_dir) for srcpath in self.list_all_files(src_dir): dstpath = srcpath.replace(src_dir, dst_dir) @@ -489,6 +501,12 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.process_file(self.cur_file) def process_file(self, file): + """ + Process an individual file. + + Check the file, handle archives using self.process_archive, copy + the file to the destionation key, and clean up temporary directory. + """ file.check() if file.is_recursive: self.process_archive(file) @@ -500,10 +518,12 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.safe_rmtree(file.tempdir_path) def process_archive(self, file): - """Unpacks an archive using 7zip and processes contents. + """ + Unpack an archive using 7zip and process contents using process_dir. Should be given a Kittengroomer file object whose src_path points - to an archive.""" + to an archive. + """ self.recursive_archive_depth += 1 # LOG: write_log or somehow log the archive file here if self.recursive_archive_depth >= self.max_recursive_depth: diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 465ad4a..318f1ca 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -32,13 +32,18 @@ class ImplementationRequired(KittenGroomerError): class FileBase(object): """ - Base object for individual files in the source directory. Contains file - attributes and various helper methods. Subclass and add attributes - or methods relevant to a given implementation. + Base object for individual files in the source directory. + + Contains file attributes and various helper methods. """ def __init__(self, src_path, dst_path, logger=None): - """Initialized with the source path and expected destination path.""" + """ + Initialized with the source path and expected destination path. + + self.logger should be a logging object with an add_file method. + Create various properties and determine the file's mimetype. + """ self.src_path = src_path self.dst_path = dst_path self.filename = os.path.basename(self.src_path) @@ -106,6 +111,7 @@ class FileBase(object): @property def size(self): + """Filesize in bytes as an int, 0 if file does not exist.""" try: size = os.path.getsize(self.src_path) except FileNotFoundError: @@ -114,7 +120,7 @@ class FileBase(object): @property def has_mimetype(self): - """Returns True if file has a full mimetype, else False.""" + """True if file has a main and sub mimetype, else False.""" # TODO: broken mimetype checks should be done somewhere else. # Should the check be by default or should we let the API consumer write it? if not self.main_type or not self.sub_type: @@ -124,7 +130,7 @@ class FileBase(object): @property def has_extension(self): - """Returns True if self.extension is set, else False.""" + """True if self.extension is set, else False.""" if self.extension is None: return False else: @@ -132,35 +138,42 @@ class FileBase(object): @property def is_dangerous(self): - """True if file has been marked 'dangerous' else False.""" + """True if file has been marked 'dangerous', else False.""" return self._file_props['safety_category'] is 'dangerous' @property def is_unknown(self): - """True if file has been marked 'unknown' else False.""" + """True if file has been marked 'unknown', else False.""" return self._file_props['safety_category'] is 'unknown' @property def is_binary(self): - """True if file has been marked 'binary' else False.""" + """True if file has been marked 'binary', else False.""" return self._file_props['safety_category'] is 'binary' @property def is_symlink(self): - """Returns True and updates log if file is a symlink.""" + """True if file is a symlink, else False.""" if self._file_props['symlink'] is False: return False else: return True def set_property(self, prop_string, value): - """Takes a property + a value and adds them to self._file_props.""" + """ + Take a property and a value and add them to self._file_props. + + If prop_string is already in _file_props, set prop_string to value. + If prop_string not in _file_props, set prop_string to value in + _file_props['user_defined']. + """ if prop_string in self._file_props.keys(): self._file_props[prop_string] = value else: self._file_props['user_defined'][prop_string] = value def get_property(self, file_prop): + """Get the value for a property in _file_props.""" # TODO: could probably be refactored if file_prop in self._file_props: return self._file_props[file_prop] @@ -170,16 +183,18 @@ class FileBase(object): return None def add_error(self, error, info): + """Add an error: info pair to _file_props['errors'].""" self._file_props['errors'].update({error: info}) def add_file_string(self, file_string): + """Add a file descriptor string to _file_props.""" self._file_props['file_string_set'].add(file_string) def make_dangerous(self, reason_string=None): """ - Marks a file as dangerous. + Mark file as dangerous. - Prepends and appends DANGEROUS to the destination file name + Prepend and append DANGEROUS to the destination file name to help prevent double-click of death. """ if self.is_dangerous: @@ -190,7 +205,7 @@ class FileBase(object): self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename)) def make_unknown(self): - """Marks a file as an unknown type and prepends UNKNOWN to filename.""" + """Mark file as an unknown type and prepend UNKNOWN to filename.""" if self.is_dangerous or self.is_binary: return self.set_property('safety_category', 'unknown') @@ -198,7 +213,7 @@ class FileBase(object): self.dst_path = os.path.join(path, 'UNKNOWN_{}'.format(filename)) def make_binary(self): - """Marks a file as a binary and appends .bin to filename.""" + """Mark file as a binary and append .bin to filename.""" if self.is_dangerous: return self.set_property('safety_category', 'binary') @@ -206,7 +221,7 @@ class FileBase(object): self.dst_path = os.path.join(path, '{}.bin'.format(filename)) def safe_copy(self, src=None, dst=None): - """Copy a file and create directory if needed.""" + """Copy file and create destination directories if needed.""" if src is None: src = self.src_path if dst is None: @@ -220,7 +235,7 @@ class FileBase(object): self.add_error(e, '') def force_ext(self, ext): - """If dst_path does not end in ext, changes it and edits _file_props.""" + """If dst_path does not end in ext, change it and edit _file_props.""" if not self.dst_path.endswith(ext): self.set_property('force_ext', True) self.dst_path += ext @@ -228,7 +243,7 @@ class FileBase(object): self.set_property('extension', ext) def create_metadata_file(self, ext): - """Create a separate file to hold this file's metadata.""" + """Create a separate file to hold metadata from this file.""" try: # make sure we aren't overwriting anything if os.path.exists(self.src_path + ext): @@ -247,7 +262,7 @@ class FileBase(object): return False def write_log(self): - """Print the logs related to the current file being processed.""" + """Write logs from file to self.logger.""" file_log = self.logger.add_file(self) file_log.fields(**self._file_props) @@ -273,7 +288,7 @@ class GroomerLogger(object): self.log_debug_out = os.devnull def tree(self, base_dir, padding=' '): - """Writes a graphical tree to the log for a given directory.""" + """Write a graphical tree to the log for `base_dir`.""" with open(self.log_content, 'ab') as lf: lf.write(bytes('#' * 80 + '\n', 'UTF-8')) lf.write(bytes('{}+- {}/\n'.format(padding, os.path.basename(os.path.abspath(base_dir)).encode()), 'utf8')) @@ -289,7 +304,7 @@ class GroomerLogger(object): lf.write('{}+-- {}\t- {}\n'.format(padding, f, self._computehash(curpath)).encode(errors='ignore')) def _computehash(self, path): - """Returns a sha256 hash of a file at a given path.""" + """Return a sha256 hash of a file at a given path.""" s = hashlib.sha256() with open(path, 'rb') as f: while True: @@ -300,6 +315,7 @@ class GroomerLogger(object): return s.hexdigest() def add_file(self, file): + """Add a file to the log.""" return self.log.name('file.src_path') @@ -340,9 +356,7 @@ class KittenGroomerBase(object): # TODO: feels like this function doesn't need to exist if we move main() def processdir(self, src_dir, dst_dir): - """ - Implement this function in your subclass to define file processing behavior. - """ + """Implement this function to define file processing behavior.""" raise ImplementationRequired('Please implement processdir.')