mirror of https://github.com/CIRCL/PyCIRCLean
Add TODOs and clarify various logging messages
parent
963a2feef4
commit
0175ee48e5
|
@ -143,7 +143,7 @@ class File(FileBase):
|
||||||
is_known_extension = self.extension in mimetypes.types_map.keys()
|
is_known_extension = self.extension in mimetypes.types_map.keys()
|
||||||
if is_known_extension and expected_mimetype != self.mimetype:
|
if is_known_extension and expected_mimetype != self.mimetype:
|
||||||
# LOG: improve this string
|
# LOG: improve this string
|
||||||
self.make_dangerous('expected_mimetyped')
|
self.make_dangerous('expected_mimetype')
|
||||||
|
|
||||||
def _check_mimetype(self):
|
def _check_mimetype(self):
|
||||||
"""Takes the mimetype (as determined by libmagic) and determines
|
"""Takes the mimetype (as determined by libmagic) and determines
|
||||||
|
@ -249,7 +249,8 @@ class File(FileBase):
|
||||||
"""Processes an application specific file according to its subtype."""
|
"""Processes an application specific file according to its subtype."""
|
||||||
for subtype, method in self.app_subtype_methods.items():
|
for subtype, method in self.app_subtype_methods.items():
|
||||||
if subtype in self.sub_type:
|
if subtype in self.sub_type:
|
||||||
# TODO: should this return a value?
|
# TODO: should we change the logic so we don't iterate through all of the subtype methods?
|
||||||
|
# TODO: should these methods return a value?
|
||||||
method()
|
method()
|
||||||
self.add_file_string('Application file')
|
self.add_file_string('Application file')
|
||||||
return
|
return
|
||||||
|
@ -258,16 +259,15 @@ class File(FileBase):
|
||||||
|
|
||||||
def _executables(self):
|
def _executables(self):
|
||||||
"""Processes an executable file."""
|
"""Processes an executable file."""
|
||||||
# LOG: change this property
|
# LOG: change the processing_type property to some other name or include in file_string
|
||||||
self.set_property('processing_type', 'executable')
|
self.set_property('processing_type', 'executable')
|
||||||
self.make_dangerous('executable')
|
self.make_dangerous('executable')
|
||||||
|
|
||||||
def _winoffice(self):
|
def _winoffice(self):
|
||||||
"""Processes a winoffice file using olefile/oletools."""
|
"""Processes a winoffice file using olefile/oletools."""
|
||||||
# LOG: change this property
|
# LOG: processing_type property
|
||||||
self.set_property('processing_type', 'WinOffice')
|
self.set_property('processing_type', 'WinOffice')
|
||||||
# Try as if it is a valid document
|
oid = oletools.oleid.OleID(self.src_path) # First assume a valid file
|
||||||
oid = oletools.oleid.OleID(self.src_path)
|
|
||||||
if not olefile.isOleFile(self.src_path):
|
if not olefile.isOleFile(self.src_path):
|
||||||
# Manual processing, may already count as suspicious
|
# Manual processing, may already count as suspicious
|
||||||
try:
|
try:
|
||||||
|
@ -291,12 +291,14 @@ class File(FileBase):
|
||||||
for i in indicators:
|
for i in indicators:
|
||||||
if i.id == 'ObjectPool' and i.value:
|
if i.id == 'ObjectPool' and i.value:
|
||||||
# TODO: Is it suspicious?
|
# TODO: Is it suspicious?
|
||||||
|
# LOG: user defined property
|
||||||
self.set_property('objpool', True)
|
self.set_property('objpool', True)
|
||||||
elif i.id == 'flash' and i.value:
|
elif i.id == 'flash' and i.value:
|
||||||
self.make_dangerous('flash')
|
self.make_dangerous('flash')
|
||||||
|
|
||||||
def _ooxml(self):
|
def _ooxml(self):
|
||||||
"""Processes an ooxml file."""
|
"""Processes an ooxml file."""
|
||||||
|
# LOG: processing_type property
|
||||||
self.set_property('processing_type', 'ooxml')
|
self.set_property('processing_type', 'ooxml')
|
||||||
try:
|
try:
|
||||||
doc = officedissector.doc.Document(self.src_path)
|
doc = officedissector.doc.Document(self.src_path)
|
||||||
|
@ -318,7 +320,7 @@ class File(FileBase):
|
||||||
def _libreoffice(self):
|
def _libreoffice(self):
|
||||||
"""Processes a libreoffice file."""
|
"""Processes a libreoffice file."""
|
||||||
self.set_property('processing_type', 'libreoffice')
|
self.set_property('processing_type', 'libreoffice')
|
||||||
# As long as there ar no way to do a sanity check on the files => dangerous
|
# As long as there is no way to do a sanity check on the files => dangerous
|
||||||
try:
|
try:
|
||||||
lodoc = zipfile.ZipFile(self.src_path, 'r')
|
lodoc = zipfile.ZipFile(self.src_path, 'r')
|
||||||
except:
|
except:
|
||||||
|
@ -332,6 +334,7 @@ class File(FileBase):
|
||||||
|
|
||||||
def _pdf(self):
|
def _pdf(self):
|
||||||
"""Processes a PDF file."""
|
"""Processes a PDF file."""
|
||||||
|
# LOG: processing_type property
|
||||||
self.set_property('processing_type', 'pdf')
|
self.set_property('processing_type', 'pdf')
|
||||||
xmlDoc = PDFiD(self.src_path)
|
xmlDoc = PDFiD(self.src_path)
|
||||||
oPDFiD = cPDFiD(xmlDoc, True)
|
oPDFiD = cPDFiD(xmlDoc, True)
|
||||||
|
@ -352,9 +355,10 @@ class File(FileBase):
|
||||||
temporary directory and self.process_dir is called on that directory.
|
temporary directory and self.process_dir is called on that directory.
|
||||||
The recursive archive depth is increased to protect against archive
|
The recursive archive depth is increased to protect against archive
|
||||||
bombs."""
|
bombs."""
|
||||||
|
# LOG: change this to something archive specific
|
||||||
self.set_property('processing_type', 'archive')
|
self.set_property('processing_type', 'archive')
|
||||||
|
self.should_copy = False
|
||||||
self.is_recursive = True
|
self.is_recursive = True
|
||||||
# self.log_string += 'Archive extracted, processing content.'
|
|
||||||
|
|
||||||
def _unknown_app(self):
|
def _unknown_app(self):
|
||||||
"""Processes an unknown file."""
|
"""Processes an unknown file."""
|
||||||
|
@ -367,10 +371,9 @@ class File(FileBase):
|
||||||
#######################
|
#######################
|
||||||
# Metadata extractors
|
# Metadata extractors
|
||||||
def _metadata_exif(self, metadata_file_path):
|
def _metadata_exif(self, metadata_file_path):
|
||||||
# TODO: this method is kind of long, can we shorten it?
|
# TODO: this method is kind of long, can we shorten it somehow?
|
||||||
img = open(self.src_path, 'rb')
|
img = open(self.src_path, 'rb')
|
||||||
tags = None
|
tags = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tags = exifread.process_file(img, debug=True)
|
tags = exifread.process_file(img, debug=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -382,19 +385,17 @@ class File(FileBase):
|
||||||
self.add_error(e, "Failed to get any metadata for file {}.".format(self.src_path))
|
self.add_error(e, "Failed to get any metadata for file {}.".format(self.src_path))
|
||||||
img.close()
|
img.close()
|
||||||
return False
|
return False
|
||||||
|
|
||||||
for tag in sorted(tags.keys()):
|
for tag in sorted(tags.keys()):
|
||||||
# These are long and obnoxious/binary
|
# These tags are long and obnoxious/binary so we don't add them
|
||||||
if tag not in ('JPEGThumbnail', 'TIFFThumbnail'):
|
if tag not in ('JPEGThumbnail', 'TIFFThumbnail'):
|
||||||
printable = str(tags[tag])
|
tag_string = str(tags[tag])
|
||||||
|
|
||||||
# Exifreader truncates data.
|
# Exifreader truncates data.
|
||||||
if len(printable) > 25 and printable.endswith(", ... ]"):
|
if len(tag_string) > 25 and tag_string.endswith(", ... ]"):
|
||||||
value = tags[tag].values
|
tag_value = tags[tag].values
|
||||||
printable = str(value)
|
tag_string = str(tag_value)
|
||||||
|
|
||||||
with open(metadata_file_path, 'w+') as metadata_file:
|
with open(metadata_file_path, 'w+') as metadata_file:
|
||||||
metadata_file.write("Key: {}\tValue: {}\n".format(tag, printable))
|
metadata_file.write("Key: {}\tValue: {}\n".format(tag, tag_string))
|
||||||
|
# LOG: how do we want to log metadata?
|
||||||
self.set_property('metadata', 'exif')
|
self.set_property('metadata', 'exif')
|
||||||
img.close()
|
img.close()
|
||||||
return True
|
return True
|
||||||
|
@ -408,10 +409,12 @@ class File(FileBase):
|
||||||
if tag not in ('icc_profile'):
|
if tag not in ('icc_profile'):
|
||||||
with open(metadata_file_path, 'w+') as metadata_file:
|
with open(metadata_file_path, 'w+') as metadata_file:
|
||||||
metadata_file.write("Key: {}\tValue: {}\n".format(tag, img.info[tag]))
|
metadata_file.write("Key: {}\tValue: {}\n".format(tag, img.info[tag]))
|
||||||
|
# LOG: handle metadata
|
||||||
self.set_property('metadata', 'png')
|
self.set_property('metadata', 'png')
|
||||||
img.close()
|
img.close()
|
||||||
# Catch decompression bombs
|
# Catch decompression bombs
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
# TODO: only catch DecompressionBombWarnings here?
|
||||||
self.add_error(e, "Caught exception processing metadata for {}".format(self.src_path))
|
self.add_error(e, "Caught exception processing metadata for {}".format(self.src_path))
|
||||||
self.make_dangerous('exception processing metadata')
|
self.make_dangerous('exception processing metadata')
|
||||||
return False
|
return False
|
||||||
|
@ -474,11 +477,6 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
|
|
||||||
def process_dir(self, src_dir, dst_dir):
|
def process_dir(self, src_dir, dst_dir):
|
||||||
"""Main function coordinating file processing."""
|
"""Main function coordinating file processing."""
|
||||||
# LOG: what's the purpose of this write log?:
|
|
||||||
# if self.recursive_archive_depth > 0:
|
|
||||||
# self.write_log()
|
|
||||||
# We're writing the log here because...
|
|
||||||
# How exactly does the workflow work with an archive?
|
|
||||||
for srcpath in self.list_all_files(src_dir):
|
for srcpath in self.list_all_files(src_dir):
|
||||||
dstpath = srcpath.replace(src_dir, dst_dir)
|
dstpath = srcpath.replace(src_dir, dst_dir)
|
||||||
# TODO: Can we clean up the way we handle relative_path?
|
# TODO: Can we clean up the way we handle relative_path?
|
||||||
|
@ -506,9 +504,9 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
Should be given a Kittengroomer file object whose src_path points
|
Should be given a Kittengroomer file object whose src_path points
|
||||||
to an archive."""
|
to an archive."""
|
||||||
self.recursive_archive_depth += 1
|
self.recursive_archive_depth += 1
|
||||||
# Check for archivebomb
|
# LOG: write_log or somehow log the archive file here
|
||||||
if self.recursive_archive_depth >= self.max_recursive_depth:
|
if self.recursive_archive_depth >= self.max_recursive_depth:
|
||||||
self._handle_archivebomb(file)
|
file.make_dangerous('Archive bomb')
|
||||||
else:
|
else:
|
||||||
tempdir_path = file.make_tempdir()
|
tempdir_path = file.make_tempdir()
|
||||||
command_str = '{} -p1 x "{}" -o"{}" -bd -aoa'
|
command_str = '{} -p1 x "{}" -o"{}" -bd -aoa'
|
||||||
|
@ -521,13 +519,6 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
self.safe_rmtree(tempdir_path)
|
self.safe_rmtree(tempdir_path)
|
||||||
self.recursive_archive_depth -= 1
|
self.recursive_archive_depth -= 1
|
||||||
|
|
||||||
def _handle_archivebomb(self, file):
|
|
||||||
file.make_dangerous('Archive bomb')
|
|
||||||
self.logger.log.warning('ARCHIVE BOMB.')
|
|
||||||
self.logger.log.warning('The content of the archive contains recursively other archives.')
|
|
||||||
self.logger.log.warning('This is a bad sign so the archive is not extracted to the destination key.')
|
|
||||||
# TODO: delete whatever we want to delete that's already been copied to dest dir
|
|
||||||
|
|
||||||
def _run_process(self, command_string, timeout=None):
|
def _run_process(self, command_string, timeout=None):
|
||||||
"""Run command_string in a subprocess, wait until it finishes."""
|
"""Run command_string in a subprocess, wait until it finishes."""
|
||||||
args = shlex.split(command_string)
|
args = shlex.split(command_string)
|
||||||
|
|
|
@ -88,7 +88,7 @@ class FileBase(object):
|
||||||
# Note: magic will always return something, even if it's just 'data'
|
# Note: magic will always return something, even if it's just 'data'
|
||||||
except UnicodeEncodeError as e:
|
except UnicodeEncodeError as e:
|
||||||
# FIXME: The encoding of the file is broken (possibly UTF-16)
|
# FIXME: The encoding of the file is broken (possibly UTF-16)
|
||||||
# One of the Travis files will trigger this.
|
# Note: one of the Travis files will trigger this exception
|
||||||
self.add_error(e, '')
|
self.add_error(e, '')
|
||||||
mt = None
|
mt = None
|
||||||
try:
|
try:
|
||||||
|
@ -115,9 +115,9 @@ class FileBase(object):
|
||||||
@property
|
@property
|
||||||
def has_mimetype(self):
|
def has_mimetype(self):
|
||||||
"""Returns True if file has a full mimetype, else False."""
|
"""Returns True if file has a full mimetype, else False."""
|
||||||
|
# TODO: broken mimetype checks should be done somewhere else.
|
||||||
|
# Should the check be by default or should we let the API consumer write it?
|
||||||
if not self.main_type or not self.sub_type:
|
if not self.main_type or not self.sub_type:
|
||||||
# LOG: log this as an error, move somewhere else?
|
|
||||||
# self._file_props.update({'broken_mime': True})
|
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
@ -126,8 +126,6 @@ class FileBase(object):
|
||||||
def has_extension(self):
|
def has_extension(self):
|
||||||
"""Returns True if self.extension is set, else False."""
|
"""Returns True if self.extension is set, else False."""
|
||||||
if self.extension is None:
|
if self.extension is None:
|
||||||
# TODO: do we actually want to have a seperate prop for no extension?
|
|
||||||
# self.set_property('no_extension', True)
|
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
@ -163,7 +161,7 @@ class FileBase(object):
|
||||||
self._file_props['user_defined'][prop_string] = value
|
self._file_props['user_defined'][prop_string] = value
|
||||||
|
|
||||||
def get_property(self, file_prop):
|
def get_property(self, file_prop):
|
||||||
# FIXME: needs to be refactored
|
# TODO: could probably be refactored
|
||||||
if file_prop in self._file_props:
|
if file_prop in self._file_props:
|
||||||
return self._file_props[file_prop]
|
return self._file_props[file_prop]
|
||||||
elif file_prop in self._file_props['user_defined']:
|
elif file_prop in self._file_props['user_defined']:
|
||||||
|
@ -187,7 +185,7 @@ class FileBase(object):
|
||||||
if self.is_dangerous:
|
if self.is_dangerous:
|
||||||
return
|
return
|
||||||
self.set_property('safety_category', 'dangerous')
|
self.set_property('safety_category', 'dangerous')
|
||||||
# LOG: store reason string
|
# LOG: store reason string somewhere and do something with it
|
||||||
path, filename = os.path.split(self.dst_path)
|
path, filename = os.path.split(self.dst_path)
|
||||||
self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename))
|
self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename))
|
||||||
|
|
||||||
|
@ -250,11 +248,12 @@ class FileBase(object):
|
||||||
|
|
||||||
def write_log(self):
|
def write_log(self):
|
||||||
"""Print the logs related to the current file being processed."""
|
"""Print the logs related to the current file being processed."""
|
||||||
tmp_log = self.logger.log.fields(**self._file_props)
|
file_log = self.logger.add_file(self)
|
||||||
|
file_log.fields(**self._file_props)
|
||||||
|
|
||||||
|
|
||||||
class GroomerLogger(object):
|
class GroomerLogger(object):
|
||||||
"""Groomer logging interface"""
|
"""Groomer logging interface."""
|
||||||
|
|
||||||
def __init__(self, root_dir_path, debug=False):
|
def __init__(self, root_dir_path, debug=False):
|
||||||
self.root_dir = root_dir_path
|
self.root_dir = root_dir_path
|
||||||
|
@ -301,8 +300,7 @@ class GroomerLogger(object):
|
||||||
return s.hexdigest()
|
return s.hexdigest()
|
||||||
|
|
||||||
def add_file(self, file):
|
def add_file(self, file):
|
||||||
# return a sublog for the file
|
return self.log.name('file.src_path')
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class KittenGroomerBase(object):
|
class KittenGroomerBase(object):
|
||||||
|
|
Loading…
Reference in New Issue