Add/update docstrings for filecheck and helpers

pull/12/head
Dan Puttick 2017-03-15 22:29:51 -04:00
parent ac94cf5d6d
commit 4d8a1d1daf
2 changed files with 80 additions and 46 deletions

View File

@ -23,6 +23,8 @@ SEVENZ_PATH = '/usr/bin/7z'
class Config: class Config:
"""Configuration information for Filecheck."""
# Application subtypes (mimetype: 'application/<subtype>') # Application subtypes (mimetype: 'application/<subtype>')
mimes_ooxml = ['vnd.openxmlformats-officedocument.'] mimes_ooxml = ['vnd.openxmlformats-officedocument.']
mimes_office = ['msword', 'vnd.ms-'] mimes_office = ['msword', 'vnd.ms-']
@ -180,12 +182,13 @@ class File(FileBase):
@property @property
def has_metadata(self): def has_metadata(self):
"""True if filetype typically contains metadata, else False."""
if self.mimetype in Config.mimes_metadata: if self.mimetype in Config.mimes_metadata:
return True return True
return False return False
def make_tempdir(self): def make_tempdir(self):
"""Make a temporary directory.""" """Make a temporary directory at self.tempdir_path."""
self.tempdir_path = self.dst_path + '_temp' self.tempdir_path = self.dst_path + '_temp'
if not os.path.exists(self.tempdir_path): if not os.path.exists(self.tempdir_path):
os.makedirs(self.tempdir_path) os.makedirs(self.tempdir_path)
@ -246,7 +249,7 @@ class File(FileBase):
self.force_ext('.txt') self.force_ext('.txt')
def application(self): def application(self):
"""Processes an application specific file according to its subtype.""" """Process an application specific file according to its subtype."""
for subtype, method in self.app_subtype_methods.items(): for subtype, method in self.app_subtype_methods.items():
if subtype in self.sub_type: if subtype in self.sub_type:
# TODO: should we change the logic so we don't iterate through all of the subtype methods? # TODO: should we change the logic so we don't iterate through all of the subtype methods?
@ -258,13 +261,13 @@ class File(FileBase):
self._unknown_app() self._unknown_app()
def _executables(self): def _executables(self):
"""Processes an executable file.""" """Process an executable file."""
# LOG: change the processing_type property to some other name or include in file_string # LOG: change the processing_type property to some other name or include in file_string
self.set_property('processing_type', 'executable') self.set_property('processing_type', 'executable')
self.make_dangerous('executable') self.make_dangerous('executable')
def _winoffice(self): def _winoffice(self):
"""Processes a winoffice file using olefile/oletools.""" """Process a winoffice file using olefile/oletools."""
# LOG: processing_type property # LOG: processing_type property
self.set_property('processing_type', 'WinOffice') self.set_property('processing_type', 'WinOffice')
oid = oletools.oleid.OleID(self.src_path) # First assume a valid file oid = oletools.oleid.OleID(self.src_path) # First assume a valid file
@ -297,7 +300,7 @@ class File(FileBase):
self.make_dangerous('flash') self.make_dangerous('flash')
def _ooxml(self): def _ooxml(self):
"""Processes an ooxml file.""" """Process an ooxml file."""
# LOG: processing_type property # LOG: processing_type property
self.set_property('processing_type', 'ooxml') self.set_property('processing_type', 'ooxml')
try: try:
@ -318,7 +321,7 @@ class File(FileBase):
self.make_dangerous('embedded pack') self.make_dangerous('embedded pack')
def _libreoffice(self): def _libreoffice(self):
"""Processes a libreoffice file.""" """Process a libreoffice file."""
self.set_property('processing_type', 'libreoffice') self.set_property('processing_type', 'libreoffice')
# As long as there is no way to do a sanity check on the files => dangerous # As long as there is no way to do a sanity check on the files => dangerous
try: try:
@ -333,7 +336,7 @@ class File(FileBase):
self.make_dangerous('macro') self.make_dangerous('macro')
def _pdf(self): def _pdf(self):
"""Processes a PDF file.""" """Process a PDF file."""
# LOG: processing_type property # LOG: processing_type property
self.set_property('processing_type', 'pdf') self.set_property('processing_type', 'pdf')
xmlDoc = PDFiD(self.src_path) xmlDoc = PDFiD(self.src_path)
@ -351,26 +354,30 @@ class File(FileBase):
self.make_dangerous('launch') self.make_dangerous('launch')
def _archive(self): def _archive(self):
"""Processes an archive using 7zip. The archive is extracted to a """
temporary directory and self.process_dir is called on that directory. Process an archive using 7zip.
The recursive archive depth is increased to protect against archive
bombs.""" The archive is extracted to a temporary directory and self.process_dir
is called on that directory. The recursive archive depth is increased
to protect against archive bombs.
"""
# LOG: change this to something archive specific # LOG: change this to something archive specific
self.set_property('processing_type', 'archive') self.set_property('processing_type', 'archive')
self.should_copy = False self.should_copy = False
self.is_recursive = True self.is_recursive = True
def _unknown_app(self): def _unknown_app(self):
"""Processes an unknown file.""" """Process an unknown file."""
self.make_unknown() self.make_unknown()
def _binary_app(self): def _binary_app(self):
"""Processses an unknown binary file.""" """Process an unknown binary file."""
self.make_binary() self.make_binary()
####################### #######################
# Metadata extractors # Metadata extractors
def _metadata_exif(self, metadata_file_path): def _metadata_exif(self, metadata_file_path):
"""Read exif metadata from a jpg or tiff file using exifread."""
# TODO: this method is kind of long, can we shorten it somehow? # TODO: this method is kind of long, can we shorten it somehow?
img = open(self.src_path, 'rb') img = open(self.src_path, 'rb')
tags = None tags = None
@ -401,6 +408,7 @@ class File(FileBase):
return True return True
def _metadata_png(self, metadata_file_path): def _metadata_png(self, metadata_file_path):
"""Extract metadata from a png file using PIL/Pillow."""
warnings.simplefilter('error', Image.DecompressionBombWarning) warnings.simplefilter('error', Image.DecompressionBombWarning)
try: try:
img = Image.open(self.src_path) img = Image.open(self.src_path)
@ -420,6 +428,7 @@ class File(FileBase):
return False return False
def extract_metadata(self): def extract_metadata(self):
"""Create metadata file and call correct metadata extraction method."""
metadata_file_path = self.create_metadata_file(".metadata.txt") metadata_file_path = self.create_metadata_file(".metadata.txt")
mt = self.mimetype mt = self.mimetype
metadata_processing_method = self.metadata_mimetype_methods.get(mt) metadata_processing_method = self.metadata_mimetype_methods.get(mt)
@ -430,12 +439,12 @@ class File(FileBase):
####################### #######################
# ##### Media - audio and video aren't converted ###### # ##### Media - audio and video aren't converted ######
def audio(self): def audio(self):
"""Processes an audio file.""" """Process an audio file."""
self.log_string += 'Audio file' self.log_string += 'Audio file'
self._media_processing() self._media_processing()
def video(self): def video(self):
"""Processes a video.""" """Process a video."""
self.log_string += 'Video file' self.log_string += 'Video file'
self._media_processing() self._media_processing()
@ -444,11 +453,14 @@ class File(FileBase):
self.set_property('processing_type', 'media') self.set_property('processing_type', 'media')
def image(self): def image(self):
"""Processes an image. """
Process an image.
Extracts metadata to dest key if metadata is present. Creates a Extracts metadata to dest key using self.extract_metada() if metadata
temporary directory on dest key, opens the using PIL.Image,saves it to is present. Creates a temporary directory on dest key, opens the image
the temporary directory, and copies it to the destination.""" using PIL.Image, saves it to the temporary directory, and copies it to
the destination.
"""
# TODO: make sure this method works for png, gif, tiff # TODO: make sure this method works for png, gif, tiff
if self.has_metadata: if self.has_metadata:
self.extract_metadata() self.extract_metadata()
@ -476,7 +488,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
self.max_recursive_depth = max_recursive_depth self.max_recursive_depth = max_recursive_depth
def process_dir(self, src_dir, dst_dir): def process_dir(self, src_dir, dst_dir):
"""Main function coordinating file processing.""" """Process a directory on the source key."""
self.logger.tree(src_dir) self.logger.tree(src_dir)
for srcpath in self.list_all_files(src_dir): for srcpath in self.list_all_files(src_dir):
dstpath = srcpath.replace(src_dir, dst_dir) dstpath = srcpath.replace(src_dir, dst_dir)
@ -489,6 +501,12 @@ class KittenGroomerFileCheck(KittenGroomerBase):
self.process_file(self.cur_file) self.process_file(self.cur_file)
def process_file(self, file): def process_file(self, file):
"""
Process an individual file.
Check the file, handle archives using self.process_archive, copy
the file to the destionation key, and clean up temporary directory.
"""
file.check() file.check()
if file.is_recursive: if file.is_recursive:
self.process_archive(file) self.process_archive(file)
@ -500,10 +518,12 @@ class KittenGroomerFileCheck(KittenGroomerBase):
self.safe_rmtree(file.tempdir_path) self.safe_rmtree(file.tempdir_path)
def process_archive(self, file): def process_archive(self, file):
"""Unpacks an archive using 7zip and processes contents. """
Unpack an archive using 7zip and process contents using process_dir.
Should be given a Kittengroomer file object whose src_path points Should be given a Kittengroomer file object whose src_path points
to an archive.""" to an archive.
"""
self.recursive_archive_depth += 1 self.recursive_archive_depth += 1
# LOG: write_log or somehow log the archive file here # LOG: write_log or somehow log the archive file here
if self.recursive_archive_depth >= self.max_recursive_depth: if self.recursive_archive_depth >= self.max_recursive_depth:

View File

@ -32,13 +32,18 @@ class ImplementationRequired(KittenGroomerError):
class FileBase(object): class FileBase(object):
""" """
Base object for individual files in the source directory. Contains file Base object for individual files in the source directory.
attributes and various helper methods. Subclass and add attributes
or methods relevant to a given implementation. Contains file attributes and various helper methods.
""" """
def __init__(self, src_path, dst_path, logger=None): def __init__(self, src_path, dst_path, logger=None):
"""Initialized with the source path and expected destination path.""" """
Initialized with the source path and expected destination path.
self.logger should be a logging object with an add_file method.
Create various properties and determine the file's mimetype.
"""
self.src_path = src_path self.src_path = src_path
self.dst_path = dst_path self.dst_path = dst_path
self.filename = os.path.basename(self.src_path) self.filename = os.path.basename(self.src_path)
@ -106,6 +111,7 @@ class FileBase(object):
@property @property
def size(self): def size(self):
"""Filesize in bytes as an int, 0 if file does not exist."""
try: try:
size = os.path.getsize(self.src_path) size = os.path.getsize(self.src_path)
except FileNotFoundError: except FileNotFoundError:
@ -114,7 +120,7 @@ class FileBase(object):
@property @property
def has_mimetype(self): def has_mimetype(self):
"""Returns True if file has a full mimetype, else False.""" """True if file has a main and sub mimetype, else False."""
# TODO: broken mimetype checks should be done somewhere else. # TODO: broken mimetype checks should be done somewhere else.
# Should the check be by default or should we let the API consumer write it? # Should the check be by default or should we let the API consumer write it?
if not self.main_type or not self.sub_type: if not self.main_type or not self.sub_type:
@ -124,7 +130,7 @@ class FileBase(object):
@property @property
def has_extension(self): def has_extension(self):
"""Returns True if self.extension is set, else False.""" """True if self.extension is set, else False."""
if self.extension is None: if self.extension is None:
return False return False
else: else:
@ -132,35 +138,42 @@ class FileBase(object):
@property @property
def is_dangerous(self): def is_dangerous(self):
"""True if file has been marked 'dangerous' else False.""" """True if file has been marked 'dangerous', else False."""
return self._file_props['safety_category'] is 'dangerous' return self._file_props['safety_category'] is 'dangerous'
@property @property
def is_unknown(self): def is_unknown(self):
"""True if file has been marked 'unknown' else False.""" """True if file has been marked 'unknown', else False."""
return self._file_props['safety_category'] is 'unknown' return self._file_props['safety_category'] is 'unknown'
@property @property
def is_binary(self): def is_binary(self):
"""True if file has been marked 'binary' else False.""" """True if file has been marked 'binary', else False."""
return self._file_props['safety_category'] is 'binary' return self._file_props['safety_category'] is 'binary'
@property @property
def is_symlink(self): def is_symlink(self):
"""Returns True and updates log if file is a symlink.""" """True if file is a symlink, else False."""
if self._file_props['symlink'] is False: if self._file_props['symlink'] is False:
return False return False
else: else:
return True return True
def set_property(self, prop_string, value): def set_property(self, prop_string, value):
"""Takes a property + a value and adds them to self._file_props.""" """
Take a property and a value and add them to self._file_props.
If prop_string is already in _file_props, set prop_string to value.
If prop_string not in _file_props, set prop_string to value in
_file_props['user_defined'].
"""
if prop_string in self._file_props.keys(): if prop_string in self._file_props.keys():
self._file_props[prop_string] = value self._file_props[prop_string] = value
else: else:
self._file_props['user_defined'][prop_string] = value self._file_props['user_defined'][prop_string] = value
def get_property(self, file_prop): def get_property(self, file_prop):
"""Get the value for a property in _file_props."""
# TODO: could probably be refactored # TODO: could probably be refactored
if file_prop in self._file_props: if file_prop in self._file_props:
return self._file_props[file_prop] return self._file_props[file_prop]
@ -170,16 +183,18 @@ class FileBase(object):
return None return None
def add_error(self, error, info): def add_error(self, error, info):
"""Add an error: info pair to _file_props['errors']."""
self._file_props['errors'].update({error: info}) self._file_props['errors'].update({error: info})
def add_file_string(self, file_string): def add_file_string(self, file_string):
"""Add a file descriptor string to _file_props."""
self._file_props['file_string_set'].add(file_string) self._file_props['file_string_set'].add(file_string)
def make_dangerous(self, reason_string=None): def make_dangerous(self, reason_string=None):
""" """
Marks a file as dangerous. Mark file as dangerous.
Prepends and appends DANGEROUS to the destination file name Prepend and append DANGEROUS to the destination file name
to help prevent double-click of death. to help prevent double-click of death.
""" """
if self.is_dangerous: if self.is_dangerous:
@ -190,7 +205,7 @@ class FileBase(object):
self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename)) self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename))
def make_unknown(self): def make_unknown(self):
"""Marks a file as an unknown type and prepends UNKNOWN to filename.""" """Mark file as an unknown type and prepend UNKNOWN to filename."""
if self.is_dangerous or self.is_binary: if self.is_dangerous or self.is_binary:
return return
self.set_property('safety_category', 'unknown') self.set_property('safety_category', 'unknown')
@ -198,7 +213,7 @@ class FileBase(object):
self.dst_path = os.path.join(path, 'UNKNOWN_{}'.format(filename)) self.dst_path = os.path.join(path, 'UNKNOWN_{}'.format(filename))
def make_binary(self): def make_binary(self):
"""Marks a file as a binary and appends .bin to filename.""" """Mark file as a binary and append .bin to filename."""
if self.is_dangerous: if self.is_dangerous:
return return
self.set_property('safety_category', 'binary') self.set_property('safety_category', 'binary')
@ -206,7 +221,7 @@ class FileBase(object):
self.dst_path = os.path.join(path, '{}.bin'.format(filename)) self.dst_path = os.path.join(path, '{}.bin'.format(filename))
def safe_copy(self, src=None, dst=None): def safe_copy(self, src=None, dst=None):
"""Copy a file and create directory if needed.""" """Copy file and create destination directories if needed."""
if src is None: if src is None:
src = self.src_path src = self.src_path
if dst is None: if dst is None:
@ -220,7 +235,7 @@ class FileBase(object):
self.add_error(e, '') self.add_error(e, '')
def force_ext(self, ext): def force_ext(self, ext):
"""If dst_path does not end in ext, changes it and edits _file_props.""" """If dst_path does not end in ext, change it and edit _file_props."""
if not self.dst_path.endswith(ext): if not self.dst_path.endswith(ext):
self.set_property('force_ext', True) self.set_property('force_ext', True)
self.dst_path += ext self.dst_path += ext
@ -228,7 +243,7 @@ class FileBase(object):
self.set_property('extension', ext) self.set_property('extension', ext)
def create_metadata_file(self, ext): def create_metadata_file(self, ext):
"""Create a separate file to hold this file's metadata.""" """Create a separate file to hold metadata from this file."""
try: try:
# make sure we aren't overwriting anything # make sure we aren't overwriting anything
if os.path.exists(self.src_path + ext): if os.path.exists(self.src_path + ext):
@ -247,7 +262,7 @@ class FileBase(object):
return False return False
def write_log(self): def write_log(self):
"""Print the logs related to the current file being processed.""" """Write logs from file to self.logger."""
file_log = self.logger.add_file(self) file_log = self.logger.add_file(self)
file_log.fields(**self._file_props) file_log.fields(**self._file_props)
@ -273,7 +288,7 @@ class GroomerLogger(object):
self.log_debug_out = os.devnull self.log_debug_out = os.devnull
def tree(self, base_dir, padding=' '): def tree(self, base_dir, padding=' '):
"""Writes a graphical tree to the log for a given directory.""" """Write a graphical tree to the log for `base_dir`."""
with open(self.log_content, 'ab') as lf: with open(self.log_content, 'ab') as lf:
lf.write(bytes('#' * 80 + '\n', 'UTF-8')) lf.write(bytes('#' * 80 + '\n', 'UTF-8'))
lf.write(bytes('{}+- {}/\n'.format(padding, os.path.basename(os.path.abspath(base_dir)).encode()), 'utf8')) lf.write(bytes('{}+- {}/\n'.format(padding, os.path.basename(os.path.abspath(base_dir)).encode()), 'utf8'))
@ -289,7 +304,7 @@ class GroomerLogger(object):
lf.write('{}+-- {}\t- {}\n'.format(padding, f, self._computehash(curpath)).encode(errors='ignore')) lf.write('{}+-- {}\t- {}\n'.format(padding, f, self._computehash(curpath)).encode(errors='ignore'))
def _computehash(self, path): def _computehash(self, path):
"""Returns a sha256 hash of a file at a given path.""" """Return a sha256 hash of a file at a given path."""
s = hashlib.sha256() s = hashlib.sha256()
with open(path, 'rb') as f: with open(path, 'rb') as f:
while True: while True:
@ -300,6 +315,7 @@ class GroomerLogger(object):
return s.hexdigest() return s.hexdigest()
def add_file(self, file): def add_file(self, file):
"""Add a file to the log."""
return self.log.name('file.src_path') return self.log.name('file.src_path')
@ -340,9 +356,7 @@ class KittenGroomerBase(object):
# TODO: feels like this function doesn't need to exist if we move main() # TODO: feels like this function doesn't need to exist if we move main()
def processdir(self, src_dir, dst_dir): def processdir(self, src_dir, dst_dir):
""" """Implement this function to define file processing behavior."""
Implement this function in your subclass to define file processing behavior.
"""
raise ImplementationRequired('Please implement processdir.') raise ImplementationRequired('Please implement processdir.')