Refactor FileBase to store props as attributes

* This is kind of a big refactoring - I realized that storing file
props in a dict was causing some subtle problems, and that just having
them as attributes makes things a lot more simple
* I considered making a separate FileProps object and nesting it
inside FileBase, but almost all FileBase methods concern manipulating
file props, so it didn't really make sense.
* Tests are almost passing with this commit, but need a few more changes
and fixes for full test coverage + all passing.
pull/16/head
Dan Puttick 2017-07-12 17:58:39 -04:00
parent f424be79cd
commit e19064c83f
4 changed files with 175 additions and 238 deletions

View File

@ -130,7 +130,7 @@ class File(FileBase):
self.logger = logger
self.tempdir_path = self.dst_path + '_temp'
subtypes_apps = [
subtypes_apps = (
(Config.mimes_office, self._winoffice),
(Config.mimes_ooxml, self._ooxml),
(Config.mimes_rtf, self.text),
@ -140,13 +140,13 @@ class File(FileBase):
(Config.mimes_ms, self._executables),
(Config.mimes_compressed, self._archive),
(Config.mimes_data, self._binary_app),
]
)
self.app_subtype_methods = self._make_method_dict(subtypes_apps)
types_metadata = [
types_metadata = (
(Config.mimes_exif, self._metadata_exif),
(Config.mimes_png, self._metadata_png),
]
)
self.metadata_mimetype_methods = self._make_method_dict(types_metadata)
self.mime_processing_options = {
@ -220,7 +220,6 @@ class File(FileBase):
right_to_left_override = u"\u202E"
if right_to_left_override in self.filename:
self.make_dangerous('Filename contains dangerous character')
self.dst_path = self.dst_path.replace(right_to_left_override, '')
self.filename = self.filename.replace(right_to_left_override, '')
self.set_property('filename', self.filename)
@ -230,9 +229,9 @@ class File(FileBase):
Delegates to various helper methods including filetype-specific checks.
"""
if self.main_type in Config.ignored_mimes:
if self.maintype in Config.ignored_mimes:
self.should_copy = False
self.mime_processing_options.get(self.main_type, self.unknown)()
self.mime_processing_options.get(self.maintype, self.unknown)()
else:
self._check_dangerous()
self._check_filename()
@ -241,14 +240,14 @@ class File(FileBase):
if self.has_mimetype:
self._check_mimetype()
if not self.is_dangerous:
self.mime_processing_options.get(self.main_type, self.unknown)()
self.mime_processing_options.get(self.maintype, self.unknown)()
def write_log(self):
"""Pass information about the file to self.logger"""
props = self.get_all_props()
if not self.is_archive:
if os.path.exists(self.tempdir_path):
# Hack to make images appear at the correct tree depth in log
# FIXME: Hack to make images appear at the correct tree depth in log
self.logger.add_file(self.src_path, props, in_tempdir=True)
return
self.logger.add_file(self.src_path, props)
@ -310,13 +309,13 @@ class File(FileBase):
def text(self):
"""Process an rtf, ooxml, or plaintext file."""
for mt in Config.mimes_rtf:
if mt in self.sub_type:
if mt in self.subtype:
self.add_description('Rich Text (rtf) file')
# TODO: need a way to convert it to plain text
self.force_ext('.txt')
return
for mt in Config.mimes_ooxml:
if mt in self.sub_type:
if mt in self.subtype:
self.add_description('OOXML (openoffice) file')
self._ooxml()
return
@ -325,13 +324,12 @@ class File(FileBase):
def application(self):
"""Process an application specific file according to its subtype."""
for subtype, method in self.app_subtype_methods.items():
if subtype in self.sub_type:
# TODO: should we change the logic so we don't iterate through all of the subtype methods?
# TODO: should these methods return a value?
method()
return
self._unknown_app()
if self.subtype in self.app_subtype_methods:
method = self.app_subtype_methods[self.subtype]
method()
# TODO: should these application methods return a value?
else:
self._unknown_app()
def _executables(self):
"""Process an executable file."""
@ -364,7 +362,6 @@ class File(FileBase):
for i in indicators:
if i.id == 'ObjectPool' and i.value:
# TODO: is having an ObjectPool suspicious?
# LOG: user defined property
self.add_description('WinOffice file containing an object pool')
elif i.id == 'flash' and i.value:
self.make_dangerous('WinOffice file with embedded flash')
@ -389,7 +386,7 @@ class File(FileBase):
if len(doc.features.embedded_packages) > 0:
self.make_dangerous('Ooxml file with embedded packages')
if not self.is_dangerous:
self.add_description('OOXML file')
self.add_description('Ooxml file')
def _libreoffice(self):
"""Process a libreoffice file."""
@ -593,10 +590,10 @@ class GroomerLogger(object):
depth = self._get_path_depth(file_path)
description_string = ', '.join(file_props['description_string'])
file_hash = Logging.computehash(file_path)[:6]
if file_props['safety_category'] is None:
description_category = "Normal"
if file_props['is_dangerous']:
description_category = "Dangerous"
else:
description_category = file_props['safety_category'].capitalize()
description_category = "Normal"
size_string = self._format_file_size(file_props['file_size'])
file_template = "+- {name} ({sha_hash}): {size}, type: {mt}/{st}. {desc}: {desc_str}"
file_string = file_template.format(
@ -608,7 +605,7 @@ class GroomerLogger(object):
desc=description_category,
desc_str=description_string,
)
# TODO: work in progress, finish adding Errors and check that they appear properly
# TODO: finish adding Errors and check that they appear properly
# if file_props['errors']:
# error_string = ', '.join([str(key) for key in file_props['errors']])
# file_string.append(' Errors: ' + error_string)

View File

@ -30,82 +30,44 @@ class FileBase(object):
Create various properties and determine the file's mimetype.
"""
self.src_path = src_path
self.dst_path = dst_path
self.filename = os.path.basename(self.src_path)
self._file_props = {
'filepath': self.src_path,
'filename': self.filename,
'file_size': self.size,
'maintype': None,
'subtype': None,
'extension': None,
'safety_category': None,
'symlink': False,
'copied': False,
'description_string': [], # array of descriptions to be joined
'errors': {},
'user_defined': {}
}
self.extension = self._determine_extension()
self.set_property('extension', self.extension)
self.mimetype = self._determine_mimetype()
self.dst_dir = os.path.dirname(dst_path)
self.filename = os.path.basename(src_path)
self.size = self._get_size(src_path)
self.is_dangerous = False # Should this be some other value initially?
self.copied = False
self.symlink_path = None
self.description_string = [] # array of descriptions to be joined
self._errors = {}
self._user_defined = {}
self.should_copy = True
self.main_type = None
self.sub_type = None
if self.mimetype:
self.main_type, self.sub_type = self._split_subtypes(self.mimetype)
if self.main_type:
self.set_property('maintype', self.main_type)
if self.sub_type:
self.set_property('subtype', self.sub_type)
self.mimetype = self._determine_mimetype(src_path)
def _determine_extension(self):
_, ext = os.path.splitext(self.src_path)
@property
def dst_path(self):
return os.path.join(self.dst_dir, self.filename)
@property
def extension(self):
_, ext = os.path.splitext(self.filename)
ext = ext.lower()
if ext == '':
ext = None
return ext
def _determine_mimetype(self):
if os.path.islink(self.src_path):
# libmagic will throw an IOError on a broken symlink
mimetype = 'inode/symlink'
self.set_property('symlink', os.readlink(self.src_path))
else:
try:
mt = magic.from_file(self.src_path, mime=True)
# libmagic will always return something, even if it's just 'data'
except UnicodeEncodeError as e:
# FIXME: The encoding of the file that triggers this is broken (possibly it's UTF-16 and Python expects utf8)
# Note: one of the Travis files will trigger this exception
self.add_error(e, '')
mt = None
try:
mimetype = mt.decode("utf-8")
except:
mimetype = mt
return mimetype
def _split_subtypes(self, mimetype):
if '/' in mimetype:
main_type, sub_type = mimetype.split('/')
else:
main_type, sub_type = None, None
return main_type, sub_type
@property
def maintype(self):
main, _ = self._split_mimetype(self.mimetype)
return main
@property
def size(self):
"""Filesize in bytes as an int, 0 if file does not exist."""
try:
size = os.path.getsize(self.src_path)
except FileNotFoundError:
size = 0
return size
def subtype(self):
_, sub = self._split_mimetype(self.mimetype)
return sub
@property
def has_mimetype(self):
"""True if file has a main and sub mimetype, else False."""
if not self.main_type or not self.sub_type:
if not self.maintype or not self.subtype:
return False
else:
return True
@ -118,43 +80,30 @@ class FileBase(object):
else:
return True
@property
def is_dangerous(self):
"""True if file has been marked 'dangerous', else False."""
return self._file_props['safety_category'] is 'dangerous'
@property
def is_unknown(self):
"""True if file has been marked 'unknown', else False."""
return self._file_props['safety_category'] is 'unknown'
@property
def is_binary(self):
"""True if file has been marked 'binary', else False."""
return self._file_props['safety_category'] is 'binary'
@property
def is_symlink(self):
"""True if file is a symlink, else False."""
if self._file_props['symlink'] is False:
if self.symlink_path is None:
return False
else:
return True
def set_property(self, prop_string, value):
"""
Take a property and a value and add them to the file's property dict.
Take a property and a value and add them to the file's stored props.
If `prop_string` is part of the file property API, set it to `value`.
Otherwise, add `prop_string`: `value` to `user_defined` properties.
TODO: rewrite docstring
"""
if prop_string is 'description_string':
if value not in self._file_props['description_string']:
self._file_props['description_string'].append(value)
elif prop_string in self._file_props.keys():
self._file_props[prop_string] = value
if hasattr(self, prop_string):
setattr(self, prop_string, value)
else:
self._file_props['user_defined'][prop_string] = value
if prop_string is 'description_string':
if value not in self.description_string:
self.description_string.append(value)
else:
self._user_defined[prop_string] = value
def get_property(self, prop_string):
"""
@ -162,20 +111,34 @@ class FileBase(object):
Returns `None` if `prop_string` cannot be found on the file.
"""
if prop_string in self._file_props:
return self._file_props[prop_string]
elif prop_string in self._file_props['user_defined']:
return self._file_props['user_defined'][prop_string]
else:
return None
try:
return getattr(self, prop_string)
except AttributeError:
return self._user_defined.get(prop_string, None)
def get_all_props(self):
"""Return a dict containing all stored properties of this file."""
return self._file_props
# Maybe move this onto the logger? I think that makes more sense
props_dict = {
'filepath': self.src_path,
'filename': self.filename,
'file_size': self.size,
'mimetype': self.mimetype,
'maintype': self.maintype,
'subtype': self.subtype,
'extension': self.extension,
'is_dangerous': self.is_dangerous,
'symlink_path': self.symlink_path,
'copied': self.copied,
'description_string': self.description_string,
'errors': self._errors,
'user_defined': self._user_defined
}
return props_dict
def add_error(self, error, info_string):
"""Add an `error`: `info_string` pair to the file."""
self._file_props['errors'].update({error: info_string})
self._errors.update({error: info_string})
def add_description(self, description_string):
"""
@ -192,29 +155,10 @@ class FileBase(object):
Prepend and append DANGEROUS to the destination file name
to help prevent double-click of death.
"""
if self.is_dangerous:
self.set_property('description_string', reason_string)
return
self.set_property('safety_category', 'dangerous')
self.set_property('description_string', reason_string)
path, filename = os.path.split(self.dst_path)
self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename))
def make_unknown(self):
"""Mark file as an unknown type and prepend UNKNOWN to filename."""
if self.is_dangerous or self.is_binary:
return
self.set_property('safety_category', 'unknown')
path, filename = os.path.split(self.dst_path)
self.dst_path = os.path.join(path, 'UNKNOWN_{}'.format(filename))
def make_binary(self):
"""Mark file as a binary and append .bin to filename."""
if self.is_dangerous:
return
self.set_property('safety_category', 'binary')
path, filename = os.path.split(self.dst_path)
self.dst_path = os.path.join(path, '{}.bin'.format(filename))
if not self.is_dangerous:
self.set_property('dangerous', True)
self.add_description(reason_string)
self.filename = 'DANGEROUS_{}_DANGEROUS'.format(self.filename)
def safe_copy(self, src=None, dst=None):
"""Copy file and create destination directories if needed."""
@ -223,52 +167,91 @@ class FileBase(object):
if dst is None:
dst = self.dst_path
try:
dst_path, filename = os.path.split(dst)
if not os.path.exists(dst_path):
os.makedirs(dst_path)
if not os.path.exists(self.dst_dir):
os.makedirs(self.dst_dir)
shutil.copy(src, dst)
except Exception as e:
# TODO: what exceptions are we expecting to catch here?
self.add_error(e, '')
def force_ext(self, ext):
"""If dst_path does not end in ext, append .ext to it."""
ext = self._check_leading_dot(ext)
if not self.dst_path.endswith(ext):
# LOG: do we want to log that the extension was changed as below?
# self.set_property('force_ext', True)
self.dst_path += ext
if not self._file_props['extension'] == ext:
self.set_property('extension', ext)
def force_ext(self, extension):
"""If dst_path does not end in `extension`, append .ext to it."""
new_ext = self._check_leading_dot(extension)
if not self.filename.endswith(new_ext):
# TODO: do we want to log that the extension was changed?
self.filename += new_ext
if not self.get_property('extension') == new_ext:
self.set_property('extension', new_ext)
def create_metadata_file(self, ext):
def create_metadata_file(self, extension):
# TODO: this method name is confusing
"""
Create a separate file to hold extracted metadata.
The string `ext` will be used as the extension for the metadata file.
The string `extension` will be used as the extension for the file.
"""
ext = self._check_leading_dot(ext)
ext = self._check_leading_dot(extension)
try:
# Prevent using the same path as another file from src_path
if os.path.exists(self.src_path + ext):
err_str = ("Could not create metadata file for \"" +
self.filename +
"\": a file with that path already exists.")
raise KittenGroomerError(err_str)
raise KittenGroomerError(
"Could not create metadata file for \"" +
self.filename +
"\": a file with that path exists.")
else:
dst_dir_path, filename = os.path.split(self.dst_path)
if not os.path.exists(dst_dir_path):
os.makedirs(dst_dir_path)
if not os.path.exists(self.dst_dir):
os.makedirs(self.dst_dir)
# TODO: shouldn't mutate state and also return something
self.metadata_file_path = self.dst_path + ext
return self.metadata_file_path
# TODO: can probably let this exception bubble up
except KittenGroomerError as e:
self.add_error(e, '')
return False
def _check_leading_dot(self, ext):
# TODO: this method name is confusing
if len(ext) > 0:
if not ext.startswith('.'):
return '.' + ext
return ext
def _determine_mimetype(self, file_path):
if os.path.islink(file_path):
# libmagic will throw an IOError on a broken symlink
mimetype = 'inode/symlink'
self.set_property('symlink', os.readlink(file_path))
else:
try:
mt = magic.from_file(file_path, mime=True)
# libmagic will always return something, even if it's just 'data'
except UnicodeEncodeError as e:
# FIXME: The encoding of the file that triggers this is broken (possibly it's UTF-16 and Python expects utf8)
# Note: one of the Travis files will trigger this exception
self.add_error(e, '')
mt = None
try:
mimetype = mt.decode("utf-8")
except:
# FIXME: what should the exception be here if mimetype isn't utf-8?
mimetype = mt
return mimetype
def _split_mimetype(self, mimetype):
if '/' in mimetype:
main_type, sub_type = mimetype.split('/')
else:
main_type, sub_type = None, None
return main_type, sub_type
def _get_size(self, file_path):
"""Filesize in bytes as an int, 0 if file does not exist."""
try:
size = os.path.getsize(file_path)
except FileNotFoundError:
size = 0
return size
class Logging(object):

View File

@ -113,7 +113,7 @@ def logger(dest_dir_path):
def test_sample_files(sample_file, groomer, logger, dest_dir_path):
if sample_file.xfail:
pytest.xfail(reason='Marked xfail in file catalog')
file_dest_path = dest_dir_path + sample_file.filename
file_dest_path = os.path.join(dest_dir_path, sample_file.filename)
file = File(sample_file.path, file_dest_path, logger)
groomer.process_file(file)
assert file.is_dangerous == sample_file.exp_dangerous

View File

@ -12,7 +12,6 @@ xfail = pytest.mark.xfail
fixture = pytest.fixture
@skip
class TestFileBase:
@fixture
@ -40,28 +39,17 @@ class TestFileBase:
return FileBase(file_path, file_path)
@fixture
def file_marked_dangerous(self, generic_conf_file):
generic_conf_file.make_dangerous()
return generic_conf_file
def file_marked_dangerous(self):
pass
@fixture
def file_marked_unknown(self, generic_conf_file):
generic_conf_file.make_unknown()
return generic_conf_file
@fixture
def file_marked_binary(self, generic_conf_file):
generic_conf_file.make_binary()
return generic_conf_file
@fixture(params=[
FileBase.make_dangerous,
FileBase.make_unknown,
FileBase.make_binary
])
def file_marked_all_parameterized(self, request, generic_conf_file):
request.param(generic_conf_file)
return generic_conf_file
# @fixture(params=[
# FileBase.make_dangerous,
# FileBase.make_unknown,
# FileBase.make_binary
# ])
# def file_marked_all_parameterized(self, request, generic_conf_file):
# request.param(generic_conf_file)
# return generic_conf_file
def test_init_identify_filename(self):
"""Init should identify the filename correctly for src_path."""
@ -91,9 +79,6 @@ class TestFileBase:
FileBase(tmpdir.strpath, tmpdir.strpath)
# TODO: are there other cases here? path to a file that doesn't exist? permissions?
def test_init(self, generic_conf_file):
generic_conf_file
def test_extension_uppercase(self, tmpdir):
file_path = tmpdir.join('TEST.TXT')
file_path.write('testing')
@ -101,19 +86,21 @@ class TestFileBase:
file = FileBase(file_path, file_path)
assert file.extension == '.txt'
def test_mimetypes(self, generic_conf_file):
assert generic_conf_file.mimetype == 'text/plain'
assert generic_conf_file.main_type == 'text'
assert generic_conf_file.sub_type == 'plain'
assert generic_conf_file.has_mimetype
def test_mimetypes(self, temp_file):
assert temp_file.mimetype == 'text/plain'
assert temp_file.maintype == 'text'
assert temp_file.subtype == 'plain'
assert temp_file.has_mimetype
# Need to test something without a mimetype
# Need to test something that's a directory
# Need to test something that causes the unicode exception
@skip
def test_has_mimetype_no_main_type(self, generic_conf_file):
generic_conf_file.main_type = ''
generic_conf_file.maintype = ''
assert generic_conf_file.has_mimetype is False
@skip
def test_has_mimetype_no_sub_type(self, generic_conf_file):
generic_conf_file.sub_type = ''
assert generic_conf_file.has_mimetype is False
@ -123,22 +110,20 @@ class TestFileBase:
print(temp_file_no_ext.extension)
assert temp_file_no_ext.has_extension is False
def test_set_property(self, generic_conf_file):
generic_conf_file.set_property('test', True)
assert generic_conf_file.get_property('test') is True
assert generic_conf_file.get_property('wrong') is None
def test_set_property(self, temp_file):
temp_file.set_property('test', True)
assert temp_file.get_property('test') is True
# TODO: split asserts into two tests
assert temp_file.get_property('wrong') is None
@skip
def test_marked_dangerous(self, file_marked_all_parameterized):
file_marked_all_parameterized.make_dangerous()
assert file_marked_all_parameterized.is_dangerous is True
# Should work regardless of weird paths??
# Should check file path alteration behavior as well
def test_generic_dangerous(self, generic_conf_file):
assert generic_conf_file.is_dangerous is False
generic_conf_file.make_dangerous()
assert generic_conf_file.is_dangerous is True
@xfail
def test_has_symlink(self, tmpdir):
file_path = tmpdir.join('test.txt')
file_path.write('testing')
@ -151,40 +136,11 @@ class TestFileBase:
assert file.is_symlink is False
assert symlink.is_symlink is True
@xfail
def test_has_symlink_fixture(self, symlink_file):
assert symlink_file.is_symlink is True
def test_generic_make_unknown(self, generic_conf_file):
assert generic_conf_file.is_unknown is False
generic_conf_file.make_unknown()
assert generic_conf_file.is_unknown
# given a FileBase object with no marking, should do the right things
def test_marked_make_unknown(self, file_marked_all_parameterized):
file = file_marked_all_parameterized
if file.is_unknown:
file.make_unknown()
assert file.is_unknown
else:
assert file.is_unknown is False
file.make_unknown()
assert file.is_unknown is False
# given a FileBase object with an unrecognized marking, should ???
def test_generic_make_binary(self, generic_conf_file):
assert generic_conf_file.is_binary is False
generic_conf_file.make_binary()
assert generic_conf_file.is_binary
def test_marked_make_binary(self, file_marked_all_parameterized):
file = file_marked_all_parameterized
if file.is_dangerous:
file.make_binary()
assert file.is_binary is False
else:
file.make_binary()
assert file.is_binary
@skip
def test_force_ext_change(self, generic_conf_file):
assert generic_conf_file.has_extension
assert generic_conf_file.get_property('extension') == '.conf'
@ -194,6 +150,7 @@ class TestFileBase:
assert generic_conf_file.get_property('extension') == '.txt'
# should be able to handle weird paths
@skip
def test_force_ext_correct(self, generic_conf_file):
assert generic_conf_file.has_extension
assert generic_conf_file.get_property('extension') == '.conf'
@ -202,6 +159,7 @@ class TestFileBase:
assert generic_conf_file.get_property('force_ext') is None
# shouldn't change a file's extension if it already is right
@xfail
def test_create_metadata_file(self, temp_file):
metadata_file_path = temp_file.create_metadata_file('.metadata.txt')
with open(metadata_file_path, 'w+') as metadata_file:
@ -211,9 +169,8 @@ class TestFileBase:
# if metadata file already exists
# if there is no metadata to write should this work?
def test_safe_copy(self, generic_conf_file):
generic_conf_file.safe_copy()
# check that safe copy can handle weird file path inputs
def test_safe_copy(self):
pass
@skip