Add TODOs and clarify various logging messages

pull/12/head
Dan Puttick 2017-03-14 10:41:31 -04:00
parent 963a2feef4
commit 0175ee48e5
2 changed files with 33 additions and 44 deletions

View File

@ -143,7 +143,7 @@ class File(FileBase):
is_known_extension = self.extension in mimetypes.types_map.keys()
if is_known_extension and expected_mimetype != self.mimetype:
# LOG: improve this string
self.make_dangerous('expected_mimetyped')
self.make_dangerous('expected_mimetype')
def _check_mimetype(self):
"""Takes the mimetype (as determined by libmagic) and determines
@ -249,7 +249,8 @@ class File(FileBase):
"""Processes an application specific file according to its subtype."""
for subtype, method in self.app_subtype_methods.items():
if subtype in self.sub_type:
# TODO: should this return a value?
# TODO: should we change the logic so we don't iterate through all of the subtype methods?
# TODO: should these methods return a value?
method()
self.add_file_string('Application file')
return
@ -258,16 +259,15 @@ class File(FileBase):
def _executables(self):
"""Processes an executable file."""
# LOG: change this property
# LOG: change the processing_type property to some other name or include in file_string
self.set_property('processing_type', 'executable')
self.make_dangerous('executable')
def _winoffice(self):
"""Processes a winoffice file using olefile/oletools."""
# LOG: change this property
# LOG: processing_type property
self.set_property('processing_type', 'WinOffice')
# Try as if it is a valid document
oid = oletools.oleid.OleID(self.src_path)
oid = oletools.oleid.OleID(self.src_path) # First assume a valid file
if not olefile.isOleFile(self.src_path):
# Manual processing, may already count as suspicious
try:
@ -291,12 +291,14 @@ class File(FileBase):
for i in indicators:
if i.id == 'ObjectPool' and i.value:
# TODO: Is it suspicious?
# LOG: user defined property
self.set_property('objpool', True)
elif i.id == 'flash' and i.value:
self.make_dangerous('flash')
def _ooxml(self):
"""Processes an ooxml file."""
# LOG: processing_type property
self.set_property('processing_type', 'ooxml')
try:
doc = officedissector.doc.Document(self.src_path)
@ -318,7 +320,7 @@ class File(FileBase):
def _libreoffice(self):
"""Processes a libreoffice file."""
self.set_property('processing_type', 'libreoffice')
# As long as there ar no way to do a sanity check on the files => dangerous
# As long as there is no way to do a sanity check on the files => dangerous
try:
lodoc = zipfile.ZipFile(self.src_path, 'r')
except:
@ -332,6 +334,7 @@ class File(FileBase):
def _pdf(self):
"""Processes a PDF file."""
# LOG: processing_type property
self.set_property('processing_type', 'pdf')
xmlDoc = PDFiD(self.src_path)
oPDFiD = cPDFiD(xmlDoc, True)
@ -352,9 +355,10 @@ class File(FileBase):
temporary directory and self.process_dir is called on that directory.
The recursive archive depth is increased to protect against archive
bombs."""
# LOG: change this to something archive specific
self.set_property('processing_type', 'archive')
self.should_copy = False
self.is_recursive = True
# self.log_string += 'Archive extracted, processing content.'
def _unknown_app(self):
"""Processes an unknown file."""
@ -367,10 +371,9 @@ class File(FileBase):
#######################
# Metadata extractors
def _metadata_exif(self, metadata_file_path):
# TODO: this method is kind of long, can we shorten it?
# TODO: this method is kind of long, can we shorten it somehow?
img = open(self.src_path, 'rb')
tags = None
try:
tags = exifread.process_file(img, debug=True)
except Exception as e:
@ -382,19 +385,17 @@ class File(FileBase):
self.add_error(e, "Failed to get any metadata for file {}.".format(self.src_path))
img.close()
return False
for tag in sorted(tags.keys()):
# These are long and obnoxious/binary
# These tags are long and obnoxious/binary so we don't add them
if tag not in ('JPEGThumbnail', 'TIFFThumbnail'):
printable = str(tags[tag])
tag_string = str(tags[tag])
# Exifreader truncates data.
if len(printable) > 25 and printable.endswith(", ... ]"):
value = tags[tag].values
printable = str(value)
if len(tag_string) > 25 and tag_string.endswith(", ... ]"):
tag_value = tags[tag].values
tag_string = str(tag_value)
with open(metadata_file_path, 'w+') as metadata_file:
metadata_file.write("Key: {}\tValue: {}\n".format(tag, printable))
metadata_file.write("Key: {}\tValue: {}\n".format(tag, tag_string))
# LOG: how do we want to log metadata?
self.set_property('metadata', 'exif')
img.close()
return True
@ -408,10 +409,12 @@ class File(FileBase):
if tag not in ('icc_profile'):
with open(metadata_file_path, 'w+') as metadata_file:
metadata_file.write("Key: {}\tValue: {}\n".format(tag, img.info[tag]))
# LOG: handle metadata
self.set_property('metadata', 'png')
img.close()
# Catch decompression bombs
except Exception as e:
# TODO: only catch DecompressionBombWarnings here?
self.add_error(e, "Caught exception processing metadata for {}".format(self.src_path))
self.make_dangerous('exception processing metadata')
return False
@ -474,11 +477,6 @@ class KittenGroomerFileCheck(KittenGroomerBase):
def process_dir(self, src_dir, dst_dir):
"""Main function coordinating file processing."""
# LOG: what's the purpose of this write log?:
# if self.recursive_archive_depth > 0:
# self.write_log()
# We're writing the log here because...
# How exactly does the workflow work with an archive?
for srcpath in self.list_all_files(src_dir):
dstpath = srcpath.replace(src_dir, dst_dir)
# TODO: Can we clean up the way we handle relative_path?
@ -506,9 +504,9 @@ class KittenGroomerFileCheck(KittenGroomerBase):
Should be given a Kittengroomer file object whose src_path points
to an archive."""
self.recursive_archive_depth += 1
# Check for archivebomb
# LOG: write_log or somehow log the archive file here
if self.recursive_archive_depth >= self.max_recursive_depth:
self._handle_archivebomb(file)
file.make_dangerous('Archive bomb')
else:
tempdir_path = file.make_tempdir()
command_str = '{} -p1 x "{}" -o"{}" -bd -aoa'
@ -521,13 +519,6 @@ class KittenGroomerFileCheck(KittenGroomerBase):
self.safe_rmtree(tempdir_path)
self.recursive_archive_depth -= 1
def _handle_archivebomb(self, file):
file.make_dangerous('Archive bomb')
self.logger.log.warning('ARCHIVE BOMB.')
self.logger.log.warning('The content of the archive contains recursively other archives.')
self.logger.log.warning('This is a bad sign so the archive is not extracted to the destination key.')
# TODO: delete whatever we want to delete that's already been copied to dest dir
def _run_process(self, command_string, timeout=None):
"""Run command_string in a subprocess, wait until it finishes."""
args = shlex.split(command_string)

View File

@ -88,7 +88,7 @@ class FileBase(object):
# Note: magic will always return something, even if it's just 'data'
except UnicodeEncodeError as e:
# FIXME: The encoding of the file is broken (possibly UTF-16)
# One of the Travis files will trigger this.
# Note: one of the Travis files will trigger this exception
self.add_error(e, '')
mt = None
try:
@ -115,9 +115,9 @@ class FileBase(object):
@property
def has_mimetype(self):
"""Returns True if file has a full mimetype, else False."""
# TODO: broken mimetype checks should be done somewhere else.
# Should the check be by default or should we let the API consumer write it?
if not self.main_type or not self.sub_type:
# LOG: log this as an error, move somewhere else?
# self._file_props.update({'broken_mime': True})
return False
else:
return True
@ -126,8 +126,6 @@ class FileBase(object):
def has_extension(self):
"""Returns True if self.extension is set, else False."""
if self.extension is None:
# TODO: do we actually want to have a seperate prop for no extension?
# self.set_property('no_extension', True)
return False
else:
return True
@ -163,7 +161,7 @@ class FileBase(object):
self._file_props['user_defined'][prop_string] = value
def get_property(self, file_prop):
# FIXME: needs to be refactored
# TODO: could probably be refactored
if file_prop in self._file_props:
return self._file_props[file_prop]
elif file_prop in self._file_props['user_defined']:
@ -187,7 +185,7 @@ class FileBase(object):
if self.is_dangerous:
return
self.set_property('safety_category', 'dangerous')
# LOG: store reason string
# LOG: store reason string somewhere and do something with it
path, filename = os.path.split(self.dst_path)
self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename))
@ -250,11 +248,12 @@ class FileBase(object):
def write_log(self):
"""Print the logs related to the current file being processed."""
tmp_log = self.logger.log.fields(**self._file_props)
file_log = self.logger.add_file(self)
file_log.fields(**self._file_props)
class GroomerLogger(object):
"""Groomer logging interface"""
"""Groomer logging interface."""
def __init__(self, root_dir_path, debug=False):
self.root_dir = root_dir_path
@ -301,8 +300,7 @@ class GroomerLogger(object):
return s.hexdigest()
def add_file(self, file):
# return a sublog for the file
pass
return self.log.name('file.src_path')
class KittenGroomerBase(object):