Add/update docstrings for filecheck and helpers

pull/12/head
Dan Puttick 2017-03-15 22:29:51 -04:00
parent ac94cf5d6d
commit 4d8a1d1daf
2 changed files with 80 additions and 46 deletions

View File

@ -23,6 +23,8 @@ SEVENZ_PATH = '/usr/bin/7z'
class Config:
"""Configuration information for Filecheck."""
# Application subtypes (mimetype: 'application/<subtype>')
mimes_ooxml = ['vnd.openxmlformats-officedocument.']
mimes_office = ['msword', 'vnd.ms-']
@ -180,12 +182,13 @@ class File(FileBase):
@property
def has_metadata(self):
"""True if filetype typically contains metadata, else False."""
if self.mimetype in Config.mimes_metadata:
return True
return False
def make_tempdir(self):
"""Make a temporary directory."""
"""Make a temporary directory at self.tempdir_path."""
self.tempdir_path = self.dst_path + '_temp'
if not os.path.exists(self.tempdir_path):
os.makedirs(self.tempdir_path)
@ -246,7 +249,7 @@ class File(FileBase):
self.force_ext('.txt')
def application(self):
"""Processes an application specific file according to its subtype."""
"""Process an application specific file according to its subtype."""
for subtype, method in self.app_subtype_methods.items():
if subtype in self.sub_type:
# TODO: should we change the logic so we don't iterate through all of the subtype methods?
@ -258,13 +261,13 @@ class File(FileBase):
self._unknown_app()
def _executables(self):
"""Processes an executable file."""
"""Process an executable file."""
# LOG: change the processing_type property to some other name or include in file_string
self.set_property('processing_type', 'executable')
self.make_dangerous('executable')
def _winoffice(self):
"""Processes a winoffice file using olefile/oletools."""
"""Process a winoffice file using olefile/oletools."""
# LOG: processing_type property
self.set_property('processing_type', 'WinOffice')
oid = oletools.oleid.OleID(self.src_path) # First assume a valid file
@ -297,7 +300,7 @@ class File(FileBase):
self.make_dangerous('flash')
def _ooxml(self):
"""Processes an ooxml file."""
"""Process an ooxml file."""
# LOG: processing_type property
self.set_property('processing_type', 'ooxml')
try:
@ -318,7 +321,7 @@ class File(FileBase):
self.make_dangerous('embedded pack')
def _libreoffice(self):
"""Processes a libreoffice file."""
"""Process a libreoffice file."""
self.set_property('processing_type', 'libreoffice')
# As long as there is no way to do a sanity check on the files => dangerous
try:
@ -333,7 +336,7 @@ class File(FileBase):
self.make_dangerous('macro')
def _pdf(self):
"""Processes a PDF file."""
"""Process a PDF file."""
# LOG: processing_type property
self.set_property('processing_type', 'pdf')
xmlDoc = PDFiD(self.src_path)
@ -351,26 +354,30 @@ class File(FileBase):
self.make_dangerous('launch')
def _archive(self):
"""Processes an archive using 7zip. The archive is extracted to a
temporary directory and self.process_dir is called on that directory.
The recursive archive depth is increased to protect against archive
bombs."""
"""
Process an archive using 7zip.
The archive is extracted to a temporary directory and self.process_dir
is called on that directory. The recursive archive depth is increased
to protect against archive bombs.
"""
# LOG: change this to something archive specific
self.set_property('processing_type', 'archive')
self.should_copy = False
self.is_recursive = True
def _unknown_app(self):
"""Processes an unknown file."""
"""Process an unknown file."""
self.make_unknown()
def _binary_app(self):
"""Processses an unknown binary file."""
"""Process an unknown binary file."""
self.make_binary()
#######################
# Metadata extractors
def _metadata_exif(self, metadata_file_path):
"""Read exif metadata from a jpg or tiff file using exifread."""
# TODO: this method is kind of long, can we shorten it somehow?
img = open(self.src_path, 'rb')
tags = None
@ -401,6 +408,7 @@ class File(FileBase):
return True
def _metadata_png(self, metadata_file_path):
"""Extract metadata from a png file using PIL/Pillow."""
warnings.simplefilter('error', Image.DecompressionBombWarning)
try:
img = Image.open(self.src_path)
@ -420,6 +428,7 @@ class File(FileBase):
return False
def extract_metadata(self):
"""Create metadata file and call correct metadata extraction method."""
metadata_file_path = self.create_metadata_file(".metadata.txt")
mt = self.mimetype
metadata_processing_method = self.metadata_mimetype_methods.get(mt)
@ -430,12 +439,12 @@ class File(FileBase):
#######################
# ##### Media - audio and video aren't converted ######
def audio(self):
"""Processes an audio file."""
"""Process an audio file."""
self.log_string += 'Audio file'
self._media_processing()
def video(self):
"""Processes a video."""
"""Process a video."""
self.log_string += 'Video file'
self._media_processing()
@ -444,11 +453,14 @@ class File(FileBase):
self.set_property('processing_type', 'media')
def image(self):
"""Processes an image.
"""
Process an image.
Extracts metadata to dest key if metadata is present. Creates a
temporary directory on dest key, opens the using PIL.Image,saves it to
the temporary directory, and copies it to the destination."""
Extracts metadata to dest key using self.extract_metada() if metadata
is present. Creates a temporary directory on dest key, opens the image
using PIL.Image, saves it to the temporary directory, and copies it to
the destination.
"""
# TODO: make sure this method works for png, gif, tiff
if self.has_metadata:
self.extract_metadata()
@ -476,7 +488,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
self.max_recursive_depth = max_recursive_depth
def process_dir(self, src_dir, dst_dir):
"""Main function coordinating file processing."""
"""Process a directory on the source key."""
self.logger.tree(src_dir)
for srcpath in self.list_all_files(src_dir):
dstpath = srcpath.replace(src_dir, dst_dir)
@ -489,6 +501,12 @@ class KittenGroomerFileCheck(KittenGroomerBase):
self.process_file(self.cur_file)
def process_file(self, file):
"""
Process an individual file.
Check the file, handle archives using self.process_archive, copy
the file to the destionation key, and clean up temporary directory.
"""
file.check()
if file.is_recursive:
self.process_archive(file)
@ -500,10 +518,12 @@ class KittenGroomerFileCheck(KittenGroomerBase):
self.safe_rmtree(file.tempdir_path)
def process_archive(self, file):
"""Unpacks an archive using 7zip and processes contents.
"""
Unpack an archive using 7zip and process contents using process_dir.
Should be given a Kittengroomer file object whose src_path points
to an archive."""
to an archive.
"""
self.recursive_archive_depth += 1
# LOG: write_log or somehow log the archive file here
if self.recursive_archive_depth >= self.max_recursive_depth:

View File

@ -32,13 +32,18 @@ class ImplementationRequired(KittenGroomerError):
class FileBase(object):
"""
Base object for individual files in the source directory. Contains file
attributes and various helper methods. Subclass and add attributes
or methods relevant to a given implementation.
Base object for individual files in the source directory.
Contains file attributes and various helper methods.
"""
def __init__(self, src_path, dst_path, logger=None):
"""Initialized with the source path and expected destination path."""
"""
Initialized with the source path and expected destination path.
self.logger should be a logging object with an add_file method.
Create various properties and determine the file's mimetype.
"""
self.src_path = src_path
self.dst_path = dst_path
self.filename = os.path.basename(self.src_path)
@ -106,6 +111,7 @@ class FileBase(object):
@property
def size(self):
"""Filesize in bytes as an int, 0 if file does not exist."""
try:
size = os.path.getsize(self.src_path)
except FileNotFoundError:
@ -114,7 +120,7 @@ class FileBase(object):
@property
def has_mimetype(self):
"""Returns True if file has a full mimetype, else False."""
"""True if file has a main and sub mimetype, else False."""
# TODO: broken mimetype checks should be done somewhere else.
# Should the check be by default or should we let the API consumer write it?
if not self.main_type or not self.sub_type:
@ -124,7 +130,7 @@ class FileBase(object):
@property
def has_extension(self):
"""Returns True if self.extension is set, else False."""
"""True if self.extension is set, else False."""
if self.extension is None:
return False
else:
@ -132,35 +138,42 @@ class FileBase(object):
@property
def is_dangerous(self):
"""True if file has been marked 'dangerous' else False."""
"""True if file has been marked 'dangerous', else False."""
return self._file_props['safety_category'] is 'dangerous'
@property
def is_unknown(self):
"""True if file has been marked 'unknown' else False."""
"""True if file has been marked 'unknown', else False."""
return self._file_props['safety_category'] is 'unknown'
@property
def is_binary(self):
"""True if file has been marked 'binary' else False."""
"""True if file has been marked 'binary', else False."""
return self._file_props['safety_category'] is 'binary'
@property
def is_symlink(self):
"""Returns True and updates log if file is a symlink."""
"""True if file is a symlink, else False."""
if self._file_props['symlink'] is False:
return False
else:
return True
def set_property(self, prop_string, value):
"""Takes a property + a value and adds them to self._file_props."""
"""
Take a property and a value and add them to self._file_props.
If prop_string is already in _file_props, set prop_string to value.
If prop_string not in _file_props, set prop_string to value in
_file_props['user_defined'].
"""
if prop_string in self._file_props.keys():
self._file_props[prop_string] = value
else:
self._file_props['user_defined'][prop_string] = value
def get_property(self, file_prop):
"""Get the value for a property in _file_props."""
# TODO: could probably be refactored
if file_prop in self._file_props:
return self._file_props[file_prop]
@ -170,16 +183,18 @@ class FileBase(object):
return None
def add_error(self, error, info):
"""Add an error: info pair to _file_props['errors']."""
self._file_props['errors'].update({error: info})
def add_file_string(self, file_string):
"""Add a file descriptor string to _file_props."""
self._file_props['file_string_set'].add(file_string)
def make_dangerous(self, reason_string=None):
"""
Marks a file as dangerous.
Mark file as dangerous.
Prepends and appends DANGEROUS to the destination file name
Prepend and append DANGEROUS to the destination file name
to help prevent double-click of death.
"""
if self.is_dangerous:
@ -190,7 +205,7 @@ class FileBase(object):
self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename))
def make_unknown(self):
"""Marks a file as an unknown type and prepends UNKNOWN to filename."""
"""Mark file as an unknown type and prepend UNKNOWN to filename."""
if self.is_dangerous or self.is_binary:
return
self.set_property('safety_category', 'unknown')
@ -198,7 +213,7 @@ class FileBase(object):
self.dst_path = os.path.join(path, 'UNKNOWN_{}'.format(filename))
def make_binary(self):
"""Marks a file as a binary and appends .bin to filename."""
"""Mark file as a binary and append .bin to filename."""
if self.is_dangerous:
return
self.set_property('safety_category', 'binary')
@ -206,7 +221,7 @@ class FileBase(object):
self.dst_path = os.path.join(path, '{}.bin'.format(filename))
def safe_copy(self, src=None, dst=None):
"""Copy a file and create directory if needed."""
"""Copy file and create destination directories if needed."""
if src is None:
src = self.src_path
if dst is None:
@ -220,7 +235,7 @@ class FileBase(object):
self.add_error(e, '')
def force_ext(self, ext):
"""If dst_path does not end in ext, changes it and edits _file_props."""
"""If dst_path does not end in ext, change it and edit _file_props."""
if not self.dst_path.endswith(ext):
self.set_property('force_ext', True)
self.dst_path += ext
@ -228,7 +243,7 @@ class FileBase(object):
self.set_property('extension', ext)
def create_metadata_file(self, ext):
"""Create a separate file to hold this file's metadata."""
"""Create a separate file to hold metadata from this file."""
try:
# make sure we aren't overwriting anything
if os.path.exists(self.src_path + ext):
@ -247,7 +262,7 @@ class FileBase(object):
return False
def write_log(self):
"""Print the logs related to the current file being processed."""
"""Write logs from file to self.logger."""
file_log = self.logger.add_file(self)
file_log.fields(**self._file_props)
@ -273,7 +288,7 @@ class GroomerLogger(object):
self.log_debug_out = os.devnull
def tree(self, base_dir, padding=' '):
"""Writes a graphical tree to the log for a given directory."""
"""Write a graphical tree to the log for `base_dir`."""
with open(self.log_content, 'ab') as lf:
lf.write(bytes('#' * 80 + '\n', 'UTF-8'))
lf.write(bytes('{}+- {}/\n'.format(padding, os.path.basename(os.path.abspath(base_dir)).encode()), 'utf8'))
@ -289,7 +304,7 @@ class GroomerLogger(object):
lf.write('{}+-- {}\t- {}\n'.format(padding, f, self._computehash(curpath)).encode(errors='ignore'))
def _computehash(self, path):
"""Returns a sha256 hash of a file at a given path."""
"""Return a sha256 hash of a file at a given path."""
s = hashlib.sha256()
with open(path, 'rb') as f:
while True:
@ -300,6 +315,7 @@ class GroomerLogger(object):
return s.hexdigest()
def add_file(self, file):
"""Add a file to the log."""
return self.log.name('file.src_path')
@ -340,9 +356,7 @@ class KittenGroomerBase(object):
# TODO: feels like this function doesn't need to exist if we move main()
def processdir(self, src_dir, dst_dir):
"""
Implement this function in your subclass to define file processing behavior.
"""
"""Implement this function to define file processing behavior."""
raise ImplementationRequired('Please implement processdir.')