Add config object to filecheck

- Grouped all configuration options for filecheck into a Config object
- Makes the code easier to read, since there are no longer many references to
different configuration globals
pull/12/head
Dan Puttick 2017-02-22 11:29:20 -05:00
parent 7d62238270
commit a450fe6b96
1 changed files with 94 additions and 94 deletions

View File

@ -9,57 +9,42 @@ import zipfile
import oletools.oleid import oletools.oleid
import olefile import olefile
import officedissector import officedissector
import warnings import warnings
import exifread import exifread
from PIL import Image from PIL import Image
# from PIL import PngImagePlugin # from PIL import PngImagePlugin
from pdfid import PDFiD, cPDFiD from pdfid import PDFiD, cPDFiD
from kittengroomer import FileBase, KittenGroomerBase, main from kittengroomer import FileBase, KittenGroomerBase, main
SEVENZ_PATH = '/usr/bin/7z' SEVENZ_PATH = '/usr/bin/7z'
# Prepare application/<subtype> class Config:
mimes_ooxml = ['vnd.openxmlformats-officedocument.'] # Application subtypes (mimetype: 'application/<subtype>')
mimes_office = ['msword', 'vnd.ms-'] mimes_ooxml = ['vnd.openxmlformats-officedocument.']
mimes_libreoffice = ['vnd.oasis.opendocument'] mimes_office = ['msword', 'vnd.ms-']
mimes_rtf = ['rtf', 'richtext'] mimes_libreoffice = ['vnd.oasis.opendocument']
mimes_pdf = ['pdf', 'postscript'] mimes_rtf = ['rtf', 'richtext']
mimes_xml = ['xml'] mimes_pdf = ['pdf', 'postscript']
mimes_ms = ['dosexec'] mimes_xml = ['xml']
mimes_compressed = ['zip', 'rar', 'bzip2', 'lzip', 'lzma', 'lzop', mimes_ms = ['dosexec']
mimes_compressed = ['zip', 'rar', 'bzip2', 'lzip', 'lzma', 'lzop',
'xz', 'compress', 'gzip', 'tar'] 'xz', 'compress', 'gzip', 'tar']
mimes_data = ['octet-stream'] mimes_data = ['octet-stream']
# Prepare image/<subtype> # Image subtypes
mimes_exif = ['image/jpeg', 'image/tiff'] mimes_exif = ['image/jpeg', 'image/tiff']
mimes_png = ['image/png'] mimes_png = ['image/png']
# Mimetypes we can pull metadata from # Mimetypes with metadata
mimes_metadata = ['image/jpeg', 'image/tiff', 'image/png'] mimes_metadata = ['image/jpeg', 'image/tiff', 'image/png']
# Aliases # Commonly used malicious extensions
aliases = { # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
# Win executables # https://github.com/wiregit/wirecode/blob/master/components/core-settings/src/main/java/org/limewire/core/settings/FilterSettings.java
'application/x-msdos-program': 'application/x-dosexec', malicious_exts = (
'application/x-dosexec': 'application/x-msdos-program',
# Other apps with confusing mimetypes
'application/rtf': 'text/rtf',
}
# Sometimes, mimetypes.guess_type is giving unexpected results, such as for the .tar.gz files:
# In [12]: mimetypes.guess_type('toot.tar.gz', strict=False)
# Out[12]: ('application/x-tar', 'gzip')
# It works as expected if you do mimetypes.guess_type('application/gzip', strict=False)
propertype = {'.gz': 'application/gzip'}
# Commonly used malicious extensions
# Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
# https://github.com/wiregit/wirecode/blob/master/components/core-settings/src/main/java/org/limewire/core/settings/FilterSettings.java
MAL_EXTS = (
# Applications # Applications
".exe", ".pif", ".application", ".gadget", ".msi", ".msp", ".com", ".scr", ".exe", ".pif", ".application", ".gadget", ".msi", ".msp", ".com", ".scr",
".hta", ".cpl", ".msc", ".jar", ".hta", ".cpl", ".msc", ".jar",
@ -77,7 +62,22 @@ MAL_EXTS = (
# banned from wirecode # banned from wirecode
".asf", ".asx", ".au", ".htm", ".html", ".mht", ".vbs", ".asf", ".asx", ".au", ".htm", ".html", ".mht", ".vbs",
".wax", ".wm", ".wma", ".wmd", ".wmv", ".wmx", ".wmz", ".wvx", ".wax", ".wm", ".wma", ".wmd", ".wmv", ".wmx", ".wmz", ".wvx",
) )
# Aliases
aliases = {
# Win executables
'application/x-msdos-program': 'application/x-dosexec',
'application/x-dosexec': 'application/x-msdos-program',
# Other apps with confusing mimetypes
'application/rtf': 'text/rtf',
}
# Sometimes, mimetypes.guess_type gives unexpected results, such as for .tar.gz files:
# In [12]: mimetypes.guess_type('toot.tar.gz', strict=False)
# Out[12]: ('application/x-tar', 'gzip')
# It works as expected if you do mimetypes.guess_type('application/gzip', strict=False)
override_ext = {'.gz': 'application/gzip'}
class File(FileBase): class File(FileBase):
@ -101,7 +101,7 @@ class File(FileBase):
self.make_dangerous() self.make_dangerous()
if not self.has_extension(): if not self.has_extension():
self.make_dangerous() self.make_dangerous()
if self.extension in MAL_EXTS: if self.extension in Config.malicious_exts:
self.log_details.update({'malicious_extension': self.extension}) self.log_details.update({'malicious_extension': self.extension})
self.make_dangerous() self.make_dangerous()
@ -111,12 +111,12 @@ class File(FileBase):
module's list of valid mimetypes and the expected mimetype based on its module's list of valid mimetypes and the expected mimetype based on its
extension differs from the mimetype determined by libmagic, then it extension differs from the mimetype determined by libmagic, then it
marks the file as dangerous.""" marks the file as dangerous."""
if propertype.get(self.extension) is not None: if self.extension in Config.override_ext:
expected_mimetype = propertype.get(self.extension) expected_mimetype = Config.override_ext[self.extension]
else: else:
expected_mimetype, encoding = mimetypes.guess_type(self.src_path, strict=False) expected_mimetype, encoding = mimetypes.guess_type(self.src_path, strict=False)
if aliases.get(expected_mimetype) is not None: if expected_mimetype in Config.aliases:
expected_mimetype = aliases.get(expected_mimetype) expected_mimetype = Config.aliases[expected_mimetype]
is_known_extension = self.extension in mimetypes.types_map.keys() is_known_extension = self.extension in mimetypes.types_map.keys()
if is_known_extension and expected_mimetype != self.mimetype: if is_known_extension and expected_mimetype != self.mimetype:
self.log_details.update({'expected_mimetype': expected_mimetype}) self.log_details.update({'expected_mimetype': expected_mimetype})
@ -126,8 +126,8 @@ class File(FileBase):
"""Takes the mimetype (as determined by libmagic) and determines """Takes the mimetype (as determined by libmagic) and determines
whether the list of extensions that are normally associated with whether the list of extensions that are normally associated with
that extension contains the file's actual extension.""" that extension contains the file's actual extension."""
if aliases.get(self.mimetype) is not None: if self.mimetype in Config.aliases:
mimetype = aliases.get(self.mimetype) mimetype = Config.aliases[self.mimetype]
else: else:
mimetype = self.mimetype mimetype = self.mimetype
expected_extensions = mimetypes.guess_all_extensions(mimetype, strict=False) expected_extensions = mimetypes.guess_all_extensions(mimetype, strict=False)
@ -137,7 +137,7 @@ class File(FileBase):
self.make_dangerous() self.make_dangerous()
def has_metadata(self): def has_metadata(self):
if self.mimetype in mimes_metadata: if self.mimetype in Config.mimes_metadata:
return True return True
return False return False
@ -151,23 +151,23 @@ class KittenGroomerFileCheck(KittenGroomerBase):
self.log_name = self.logger.log self.log_name = self.logger.log
subtypes_apps = [ subtypes_apps = [
(mimes_office, self._winoffice), (Config.mimes_office, self._winoffice),
(mimes_ooxml, self._ooxml), (Config.mimes_ooxml, self._ooxml),
(mimes_rtf, self.text), (Config.mimes_rtf, self.text),
(mimes_libreoffice, self._libreoffice), (Config.mimes_libreoffice, self._libreoffice),
(mimes_pdf, self._pdf), (Config.mimes_pdf, self._pdf),
(mimes_xml, self.text), (Config.mimes_xml, self.text),
(mimes_ms, self._executables), (Config.mimes_ms, self._executables),
(mimes_compressed, self._archive), (Config.mimes_compressed, self._archive),
(mimes_data, self._binary_app), (Config.mimes_data, self._binary_app),
] ]
self.subtypes_application = self._init_subtypes_application(subtypes_apps) self.app_subtype_methods = self._make_method_dict(subtypes_apps)
types_metadata = [ types_metadata = [
(mimes_exif, self._metadata_exif), (Config.mimes_exif, self._metadata_exif),
(mimes_png, self._metadata_png), (Config.mimes_png, self._metadata_png),
] ]
self.metadata_processing_options = self._init_subtypes_application(types_metadata) self.metadata_mimetype_methods = self._make_method_dict(types_metadata)
self.mime_processing_options = { self.mime_processing_options = {
'text': self.text, 'text': self.text,
@ -183,17 +183,17 @@ class KittenGroomerFileCheck(KittenGroomerBase):
} }
# ##### Helper functions ##### # ##### Helper functions #####
def _init_subtypes_application(self, subtypes_application): def _make_method_dict(self, list_of_tuples):
"""Creates a dictionary with the right method based on the sub mime type.""" """Returns a dictionary with mimetype: method pairs."""
subtype_dict = {} dict_to_return = {}
for list_subtypes, func in subtypes_application: for list_of_subtypes, method in list_of_tuples:
for st in list_subtypes: for subtype in list_of_subtypes:
subtype_dict[st] = func dict_to_return[subtype] = method
return subtype_dict return dict_to_return
def _print_log(self): def _print_log(self):
"""Print the logs related to the current file being processed.""" """Print the logs related to the current file being processed."""
# TODO: change name to _write_log # TODO: change name to _write_log, move to helpers
tmp_log = self.logger.log.fields(**self.cur_file.log_details) tmp_log = self.logger.log.fields(**self.cur_file.log_details)
if self.cur_file.is_dangerous(): if self.cur_file.is_dangerous():
tmp_log.warning(self.cur_file.log_string) tmp_log.warning(self.cur_file.log_string)
@ -205,7 +205,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
def _run_process(self, command_string, timeout=None): def _run_process(self, command_string, timeout=None):
"""Run command_string in a subprocess, wait until it finishes.""" """Run command_string in a subprocess, wait until it finishes."""
args = shlex.split(command_string) args = shlex.split(command_string)
# TODO: log_debug_err and log_debug are now broken, fix # TODO: log_debug_err and log_debug are now broken, fix, move to helpers
with open(self.logger.log_debug_err, 'ab') as stderr, open(self.logger.log_debug_out, 'ab') as stdout: with open(self.logger.log_debug_err, 'ab') as stderr, open(self.logger.log_debug_out, 'ab') as stdout:
try: try:
subprocess.check_call(args, stdout=stdout, stderr=stderr, timeout=timeout) subprocess.check_call(args, stdout=stdout, stderr=stderr, timeout=timeout)
@ -250,15 +250,15 @@ class KittenGroomerFileCheck(KittenGroomerBase):
# ##### Files that will be converted ###### # ##### Files that will be converted ######
def text(self): def text(self):
"""Process an rtf, ooxml, or plaintext file.""" """Process an rtf, ooxml, or plaintext file."""
for r in mimes_rtf: for mt in Config.mimes_rtf:
if r in self.cur_file.sub_type: if mt in self.cur_file.sub_type:
self.cur_file.log_string += 'Rich Text file' self.cur_file.log_string += 'Rich Text file'
# TODO: need a way to convert it to plain text # TODO: need a way to convert it to plain text
self.cur_file.force_ext('.txt') self.cur_file.force_ext('.txt')
self._safe_copy() self._safe_copy()
return return
for o in mimes_ooxml: for mt in Config.mimes_ooxml:
if o in self.cur_file.sub_type: if mt in self.cur_file.sub_type:
self.cur_file.log_string += 'OOXML File' self.cur_file.log_string += 'OOXML File'
self._ooxml() self._ooxml()
return return
@ -268,9 +268,9 @@ class KittenGroomerFileCheck(KittenGroomerBase):
def application(self): def application(self):
"""Processes an application specific file according to its subtype.""" """Processes an application specific file according to its subtype."""
for subtype, fct in self.subtypes_application.items(): for subtype, method in self.app_subtype_methods.items():
if subtype in self.cur_file.sub_type: if subtype in self.cur_file.sub_type:
fct() method()
self.cur_file.log_string += 'Application file' self.cur_file.log_string += 'Application file'
return return
self.cur_file.log_string += 'Unknown Application file' self.cur_file.log_string += 'Unknown Application file'
@ -401,8 +401,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
extract_command = '{} -p1 x "{}" -o"{}" -bd -aoa'.format(SEVENZ_PATH, self.cur_file.src_path, tmpdir) extract_command = '{} -p1 x "{}" -o"{}" -bd -aoa'.format(SEVENZ_PATH, self.cur_file.src_path, tmpdir)
self._run_process(extract_command) self._run_process(extract_command)
self.recursive_archive_depth += 1 self.recursive_archive_depth += 1
# Broken so commenting out for now: self.logger.tree(tmpdir)
# self.tree(tmpdir)
self.processdir(tmpdir, self.cur_file.dst_path) self.processdir(tmpdir, self.cur_file.dst_path)
self.recursive_archive_depth -= 1 self.recursive_archive_depth -= 1
self._safe_rmtree(tmpdir) self._safe_rmtree(tmpdir)
@ -488,10 +487,10 @@ class KittenGroomerFileCheck(KittenGroomerBase):
def extract_metadata(self): def extract_metadata(self):
metadata_file_path = self.cur_file.create_metadata_file(".metadata.txt") metadata_file_path = self.cur_file.create_metadata_file(".metadata.txt")
# todo: write metadata to file mt = self.cur_file.mimetype
mime = self.cur_file.mimetype metadata_processing_method = self.metadata_mimetype_methods.get(mt)
metadata_processing_method = self.metadata_processing_options.get(mime)
if metadata_processing_method: if metadata_processing_method:
# TODO: should we return metadata and write it here instead of in processing method?
metadata_processing_method(metadata_file_path) metadata_processing_method(metadata_file_path)
####################### #######################
@ -565,6 +564,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
def processdir(self, src_dir=None, dst_dir=None): def processdir(self, src_dir=None, dst_dir=None):
"""Main function coordinating file processing.""" """Main function coordinating file processing."""
# TODO: do we need defaults here?
if src_dir is None: if src_dir is None:
src_dir = self.src_root_dir src_dir = self.src_root_dir
if dst_dir is None: if dst_dir is None: