class Config:
    """Configuration information for Filecheck.

    Pure data holder: every attribute is a class-level constant that the
    ``File`` checks read (mimetype fragments, extension lists, filename
    character sets, mimetype aliases).
    """

    # Application subtypes (mimetype: 'application/')
    # Each entry is matched as a substring of the file's subtype.
    mimes_ooxml = ['vnd.openxmlformats-officedocument.']
    mimes_office = ['msword', 'vnd.ms-']
    mimes_libreoffice = ['vnd.oasis.opendocument']
    mimes_rtf = ['rtf', 'richtext']
    mimes_pdf = ['pdf', 'postscript']
    mimes_xml = ['xml']
    mimes_ms = ['dosexec']
    mimes_compressed = ['zip', 'rar', 'bzip2', 'lzip', 'lzma', 'lzop',
                        'xz', 'compress', 'gzip', 'tar']
    mimes_data = ['octet-stream']

    # Image subtypes
    mimes_exif = ['image/jpeg', 'image/tiff']
    mimes_png = ['image/png']

    # Mimetypes with metadata
    mimes_metadata = ['image/jpeg', 'image/tiff', 'image/png']

    # Compressed files susceptible to have double extension
    double_exts = (".uzip", ".lzma", ".z", ".xz", ".lz", ".gz2", ".gz",
                   ".bz2", ".pack", ".rar", ".000")

    # Commonly used malicious extensions
    # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    # https://github.com/wiregit/wirecode/blob/master/components/core-settings/src/main/java/org/limewire/core/settings/FilterSettings.java
    # NOTE: the sources overlap, so some extensions are listed more than
    # once; that is harmless for the membership tests this tuple is used for.
    malicious_exts = (
        # Applications
        ".exe", ".pif", ".application", ".gadget", ".msi", ".msp", ".com", ".scr",
        ".hta", ".cpl", ".msc", ".jar",
        # Scripts
        ".bat", ".cmd", ".vb", ".vbs", ".vbe", ".js", ".jse", ".ws", ".wsf",
        ".wsc", ".wsh", ".ps1", ".ps1xml", ".ps2", ".ps2xml", ".psc1", ".psc2",
        ".msh", ".msh1", ".msh2", ".mshxml", ".msh1xml", ".msh2xml",
        # Shortcuts
        ".scf", ".lnk", ".inf",
        # Other
        ".reg", ".dll",
        # Office macro (OOXML with macro enabled)
        ".docm", ".dotm", ".xlsm", ".xltm", ".xlam", ".pptm", ".potm", ".ppam",
        ".ppsm", ".sldm",
        # banned from wirecode
        ".asf", ".asx", ".au", ".htm", ".html", ".mht", ".vbs",
        ".wax", ".wm", ".wma", ".wmd", ".wmv", ".wmx", ".wmz", ".wvx",
        # Google chrome malicious extensions
        ".ad", ".ade", ".adp", ".ah", ".apk", ".app", ".application", ".asp",
        ".asx", ".bas", ".bash", ".bat", ".cfg", ".chi", ".chm", ".class",
        ".cmd", ".com", ".command", ".crt", ".crx", ".csh", ".deb", ".dex",
        ".dll", ".drv", ".exe", ".fxp", ".grp", ".hlp", ".hta", ".htm", ".html",
        ".htt", ".inf", ".ini", ".ins", ".isp", ".jar", ".jnlp", ".user.js",
        ".js", ".jse", ".ksh", ".lnk", ".local", ".mad", ".maf", ".mag", ".mam",
        ".manifest", ".maq", ".mar", ".mas", ".mat", ".mau", ".mav", ".maw",
        ".mda", ".mdb", ".mde", ".mdt", ".mdw", ".mdz", ".mht", ".mhtml", ".mmc",
        ".mof", ".msc", ".msh", ".mshxml", ".msi", ".msp", ".mst", ".ocx", ".ops",
        ".pcd", ".pif", ".pkg", ".pl", ".plg", ".prf", ".prg", ".pst", ".py",
        ".pyc", ".pyw", ".rb", ".reg", ".rpm", ".scf", ".scr", ".sct", ".sh",
        ".shar", ".shb", ".shs", ".shtm", ".shtml", ".spl", ".svg", ".swf", ".sys",
        ".tcsh", ".url", ".vb", ".vbe", ".vbs", ".vsd", ".vsmacros", ".vss",
        ".vst", ".vsw", ".ws", ".wsc", ".wsf", ".wsh", ".xbap", ".xht", ".xhtm",
        ".xhtml", ".xml", ".xsl", ".xslt", ".website", ".msh1", ".msh2", ".msh1xml",
        ".msh2xml", ".ps1", ".ps1xml", ".ps2", ".ps2xml", ".psc1", ".psc2", ".xnk",
        ".appref-ms", ".gadget", ".efi", ".fon", ".partial", ".svg", ".xml",
        ".xrm_ms", ".xsl", ".action", ".bin", ".inx", ".ipa", ".isu", ".job",
        ".out", ".pad", ".paf", ".rgs", ".u3p", ".vbscript", ".workflow", ".001",
        ".ace", ".arc", ".arj", ".b64", ".balz", ".bhx", ".cab", ".cpio", ".fat",
        ".hfs", ".hqx", ".iso", ".lha", ".lpaq1", ".lpaq5", ".lpaq8", ".lzh",
        ".mim", ".ntfs", ".paq8f", ".paq8jd", ".paq8l", ".paq8o", ".pea", ".quad",
        ".r00", ".r01", ".r02", ".r03", ".r04", ".r05", ".r06", ".r07", ".r08",
        ".r09", ".r10", ".r11", ".r12", ".r13", ".r14", ".r15", ".r16", ".r17",
        ".r18", ".r19", ".r20", ".r21", ".r22", ".r23", ".r24", ".r25", ".r26",
        ".r27", ".r28", ".r29", ".squashfs", ".swm", ".tpz", ".txz", ".tz", ".udf",
        ".uu", ".uue", ".vhd", ".vmdk", ".wim", ".wrc", ".xar", ".xxe", ".z",
        ".zipx", ".zpaq", ".cdr", ".dart", ".dc42", ".diskcopy42", ".dmg",
        ".dmgpart", ".dvdr", ".img", ".imgpart", ".ndif", ".smi", ".sparsebundle",
        ".sparseimage", ".toast", ".udif",
    )

    # Forbidden char in filename
    dangerous_char = ("\\", "/", "<", ">", "%", "*", "~", "?", "$", ":", ";", ".", " ", u"\u202E")

    # Only char accepted in filename
    accepted_char = "ABCDEFGHIJKLMNOPQRSTUVWXYZ-abcdefghijklmnopqrstuvwxyzàéôù_0123456789"

    # Beware of options (translated from the French "Attention aux options").
    # NOTE(review): presumably these describe script first lines (interpreter
    # names and the leading '#'); no consumer is visible here - confirm.
    script_endline = ("sh", "bash", "dash", "python", "perl")
    # A real 1-tuple: ("#") is just the string "#", for which
    # `"" in script_firstchar` would be True.
    script_firstchar = ("#",)

    # Aliases
    aliases = {
        # Win executables
        'application/x-msdos-program': 'application/x-dosexec',
        'application/x-dosexec': 'application/x-msdos-program',
        # Other apps with confusing mimetypes
        'application/rtf': 'text/rtf',
    }

    # Sometimes, mimetypes.guess_type gives unexpected results, such as for .tar.gz files:
    # In [12]: mimetypes.guess_type('toot.tar.gz', strict=False)
    # Out[12]: ('application/x-tar', 'gzip')
    # It works as expected if you do mimetypes.guess_type('application/gzip', strict=False)
    override_ext = {'.gz': 'application/gzip'}

    # Main mimetypes that are processed without being copied.
    ignored_mimes = ['inode', 'model', 'multipart', 'example']

    # NOTE(review): no reader of this value is visible here; None presumably
    # means "no timeout" - confirm against the archive-processing code.
    archive_timeout = None
def _check_dangerous(self):
    """Flag the file for each basic red flag it shows.

    Three independent checks run in a fixed order, and every one that
    matches calls ``self.make_dangerous`` with its own reason: missing
    mimetype, missing extension, extension listed in
    ``Config.malicious_exts``.
    """
    # Predicates are wrapped in lambdas so that each attribute is read only
    # when its turn comes, i.e. after any earlier make_dangerous() call.
    red_flags = (
        (lambda: not self.has_mimetype,
         'File has no mimetype'),
        (lambda: not self.has_extension,
         'File has no extension'),
        (lambda: self.extension in Config.malicious_exts,
         'Extension identifies file as potentially dangerous'),
    )
    for is_raised, reason in red_flags:
        if is_raised():
            self.make_dangerous(reason)
def _check_filename(self):
    """
    Replace forbidden characters to avoid name obfuscation (see class Config).

    Every character of the base name (``self.filename`` without its last
    extension) that is NOT listed in ``Config.accepted_char`` is replaced
    by ``-``. Dots inside the base name are not accepted either, which
    removes double extensions. If anything had to be replaced the file is
    flagged as dangerous.

    Files whose extension is listed in ``Config.double_exts`` keep their
    base name unchanged. In every case ``self.dst_path`` is rebuilt from
    its own directory plus the (possibly cleaned) name and the extension.
    """
    dir_path = os.path.dirname(self.dst_path)
    name, ext = os.path.splitext(self.filename)

    if self.extension not in Config.double_exts:
        # Whitelist check: keep accepted characters, replace all others.
        cleaned = ''.join(
            char if char in Config.accepted_char else '-' for char in name
        )
        if cleaned != name:
            self.make_dangerous('Filename contains forbidden character')
            name = cleaned

    # TODO: change self.filename and'filename' property? Or should those reflect the values on the source key
    self.dst_path = os.path.join(dir_path, name + ext)
def text(self):
    """Handle a file whose content is text: rtf, ooxml or plain text.

    The subtype is matched against the ``Config.mimes_rtf`` fragments
    first, then against ``Config.mimes_ooxml``; anything else is treated
    as plain text. Rtf and plain text files get a forced ``.txt``
    extension, ooxml files are handed to ``self._ooxml``.
    """
    if any(fragment in self.sub_type for fragment in Config.mimes_rtf):
        self.add_description('Rich Text (rtf) file')
        # TODO: need a way to convert it to plain text
        self.force_ext('.txt')
    elif any(fragment in self.sub_type for fragment in Config.mimes_ooxml):
        self.add_description('OOXML (openoffice) file')
        self._ooxml()
    else:
        self.add_description('Plain text file')
        self.force_ext('.txt')
def _executables(self):
    """Run the ClamAV scan on an executable, then flag it as dangerous.

    The file is flagged whatever the scan reports.
    """
    # LOG: change the processing_type property to some other name or include in file_string
    reason = 'Executable file'
    self._clamdscan()
    self.make_dangerous(reason)
def _libreoffice(self):
    """Process a libreoffice file.

    The document is opened as a zip archive. If that fails the file is
    flagged as invalid and processing stops. Otherwise the file is flagged
    as containing executable code when any archive entry name
    (case-insensitive) starts with ``script``, ``basic`` or ``object``, or
    ends with ``.bin``. A file that is not dangerous once the checks are
    done gets the 'Libreoffice file' description.
    """
    # As long as there is no way to do a sanity check on the files => dangerous
    try:
        lodoc = zipfile.ZipFile(self.src_path, 'r')
    except Exception:
        # TODO: are there specific exceptions we should catch here? Or should it be everything
        self.make_dangerous('Invalid libreoffice file')
        # Nothing to inspect: without this return `lodoc` would be unbound.
        return
    try:
        entry_names = [info.filename.lower() for info in lodoc.infolist()]
    finally:
        lodoc.close()

    risky_prefixes = ('script', 'basic', 'object')
    if any(entry.startswith(risky_prefixes) or entry.endswith('.bin')
           for entry in entry_names):
        self.make_dangerous('Libreoffice file containing executable code')
    if not self.is_dangerous:
        self.add_description('Libreoffice file')
def _archive(self):
    """
    Mark the file as an archive that must be processed recursively.

    Nothing is extracted here: the method only records the 'Archive'
    description, disables the plain copy (``should_copy``) and raises the
    ``is_recursive`` flag.
    """
    # TODO: change this to something archive type specific instead of generic 'Archive'
    self.add_description('Archive')
    self.should_copy, self.is_recursive = False, True
def _metadata_png(self, metadata_file_path):
    """Extract metadata from a png file using PIL/Pillow.

    Every entry of the image's ``info`` mapping except ``icc_profile`` is
    written to ``metadata_file_path`` as one ``Key: ...\\tValue: ...`` line,
    sorted by key, and the 'metadata' property is set to 'png'.

    Args:
        metadata_file_path: path of the text file receiving the metadata.

    Returns:
        True when extraction succeeded. False when any exception was
        raised, in which case the error is recorded and the file is
        flagged as dangerous.
    """
    # Turn Pillow's decompression-bomb warning into an exception so that it
    # ends up in the except branch below.
    warnings.simplefilter('error', Image.DecompressionBombWarning)
    # These are long and obnoxious/binary. Must be a real tuple: with a
    # bare string, `in` would perform a substring test.
    skipped_tags = ('icc_profile',)
    try:
        img = Image.open(self.src_path)
        try:
            lines = [
                "Key: {}\tValue: {}\n".format(tag, img.info[tag])
                for tag in sorted(img.info.keys())
                if tag not in skipped_tags
            ]
        finally:
            img.close()
        if lines:
            # Open once: reopening with 'w+' per tag truncated the file each
            # time and kept only the last tag.
            with open(metadata_file_path, 'w+') as metadata_file:
                metadata_file.writelines(lines)
        # LOG: handle metadata
        self.set_property('metadata', 'png')
    except Exception as e:  # Catch decompression bombs
        # TODO: only catch DecompressionBombWarnings here?
        self.add_error(e, "Caught exception processing metadata for {}".format(self.src_path))
        self.make_dangerous('exception processing metadata')
        return False
    return True
def _clamdscan(self):
    """
    Scan a file with ClamAV. Requires clamd to be running.

    Runs the ``CLAMDSCAN`` binary on ``self.src_path`` and reads the first
    line of its standard output, expected to look like
    ``<path>: <verdict>``. The file is flagged as dangerous unless the
    verdict is exactly ``OK``. Output without a ``': '`` separator (no
    verdict could be read) also flags the file, so a failed scan is never
    mistaken for a clean one.
    """
    process = subprocess.Popen([CLAMDSCAN, self.src_path],
                               stdout=subprocess.PIPE)
    output = process.communicate()[0]
    first_line = output.split(b'\n')[0]
    # Split from the right: the path itself may contain ': '.
    _path, separator, verdict = first_line.rpartition(b': ')
    if not separator:
        self.make_dangerous('File could not be scanned by ClamAV')
    elif verdict.strip() != b'OK':
        self.make_dangerous('File has been detected as a virus')
http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/ - # https://github.com/wiregit/wirecode/blob/master/components/core-settings/src/main/java/org/limewire/core/settings/FilterSettings.java - malicious_exts = ( - # Applications - ".exe", ".pif", ".application", ".gadget", ".msi", ".msp", ".com", ".scr", - ".hta", ".cpl", ".msc", ".jar", - # Scripts - ".bat", ".cmd", ".vb", ".vbs", ".vbe", ".js", ".jse", ".ws", ".wsf", - ".wsc", ".wsh", ".ps1", ".ps1xml", ".ps2", ".ps2xml", ".psc1", ".psc2", - ".msh", ".msh1", ".msh2", ".mshxml", ".msh1xml", ".msh2xml", - # Shortcuts - ".scf", ".lnk", ".inf", - # Other - ".reg", ".dll", - # Office macro (OOXML with macro enabled) - ".docm", ".dotm", ".xlsm", ".xltm", ".xlam", ".pptm", ".potm", ".ppam", - ".ppsm", ".sldm", - # banned from wirecode - ".asf", ".asx", ".au", ".htm", ".html", ".mht", ".vbs", - ".wax", ".wm", ".wma", ".wmd", ".wmv", ".wmx", ".wmz", ".wvx", - # Google chrome malicious extensions - ".ad", ".ade", ".adp", ".ah", ".apk", ".app", ".application", ".asp", - ".asx", ".bas", ".bash", ".bat", ".cfg", ".chi", ".chm", ".class", - ".cmd", ".com", ".command", ".crt", ".crx", ".csh", ".deb", ".dex", - ".dll", ".drv", ".exe", ".fxp", ".grp", ".hlp", ".hta", ".htm", ".html", - ".htt", ".inf", ".ini", ".ins", ".isp", ".jar", ".jnlp", ".user.js", - ".js", ".jse", ".ksh", ".lnk", ".local", ".mad", ".maf", ".mag", ".mam", - ".manifest", ".maq", ".mar", ".mas", ".mat", ".mau", ".mav", ".maw", - ".mda", ".mdb", ".mde", ".mdt", ".mdw", ".mdz", ".mht", ".mhtml", ".mmc", - ".mof", ".msc", ".msh", ".mshxml", ".msi", ".msp", ".mst", ".ocx", ".ops", - ".pcd", ".pif", ".pkg", ".pl", ".plg", ".prf", ".prg", ".pst", ".py", - ".pyc", ".pyw", ".rb", ".reg", ".rpm", ".scf", ".scr", ".sct", ".sh", - ".shar", ".shb", ".shs", ".shtm", ".shtml", ".spl", ".svg", ".swf", ".sys", - ".tcsh", ".url", ".vb", ".vbe", ".vbs", ".vsd", ".vsmacros", ".vss", - ".vst", ".vsw", ".ws", ".wsc", ".wsf", ".wsh", 
".xbap", ".xht", ".xhtm", - ".xhtml", ".xml", ".xsl", ".xslt", ".website", ".msh1", ".msh2", ".msh1xml", - ".msh2xml", ".ps1", ".ps1xml", ".ps2", ".ps2xml", ".psc1", ".psc2", ".xnk", - ".appref-ms", ".gadget", ".efi", ".fon", ".partial", ".svg", ".xml", - ".xrm_ms", ".xsl", ".action", ".bin", ".inx", ".ipa", ".isu", ".job", - ".out", ".pad", ".paf", ".rgs", ".u3p", ".vbscript", ".workflow", ".001", - ".ace", ".arc", ".arj", ".b64", ".balz", ".bhx", ".cab", ".cpio", ".fat", - ".hfs", ".hqx", ".iso", ".lha", ".lpaq1", ".lpaq5", ".lpaq8", ".lzh", - ".mim", ".ntfs", ".paq8f", ".paq8jd", ".paq8l", ".paq8o", ".pea", ".quad", - ".r00", ".r01", ".r02", ".r03", ".r04", ".r05", ".r06", ".r07", ".r08", - ".r09", ".r10", ".r11", ".r12", ".r13", ".r14", ".r15", ".r16", ".r17", - ".r18", ".r19", ".r20", ".r21", ".r22", ".r23", ".r24", ".r25", ".r26", - ".r27", ".r28", ".r29", ".squashfs", ".swm", ".tpz", ".txz", ".tz", ".udf", - ".uu", ".uue", ".vhd", ".vmdk", ".wim", ".wrc", ".xar", ".xxe", ".z", - ".zipx", ".zpaq", ".cdr", ".dart", ".dc42", ".diskcopy42", ".dmg", - ".dmgpart", ".dvdr", ".img", ".imgpart", ".ndif", ".smi", ".sparsebundle", - ".sparseimage", ".toast", ".udif", - ) - - # Aliases - aliases = { - # Win executables - 'application/x-msdos-program': 'application/x-dosexec', - 'application/x-dosexec': 'application/x-msdos-program', - # Other apps with confusing mimetypes - 'application/rtf': 'text/rtf', - } - - # Sometimes, mimetypes.guess_type gives unexpected results, such as for .tar.gz files: - # In [12]: mimetypes.guess_type('toot.tar.gz', strict=False) - # Out[12]: ('application/x-tar', 'gzip') - # It works as expected if you do mimetypes.guess_type('application/gzip', strict=False) - override_ext = {'.gz': 'application/gzip'} - - ignored_mimes = ['inode', 'model', 'multipart', 'example'] - - -class File(FileBase): - - def __init__(self, src_path, dst_path, logger): - super(File, self).__init__(src_path, dst_path) - self.is_recursive = False - self.logger = 
logger - self.tempdir_path = self.dst_path + '_temp' - - subtypes_apps = [ - (Config.mimes_office, self._winoffice), - (Config.mimes_ooxml, self._ooxml), - (Config.mimes_rtf, self.text), - (Config.mimes_libreoffice, self._libreoffice), - (Config.mimes_pdf, self._pdf), - (Config.mimes_xml, self.text), - (Config.mimes_ms, self._executables), - (Config.mimes_compressed, self._archive), - (Config.mimes_data, self._binary_app), - ] - self.app_subtype_methods = self._make_method_dict(subtypes_apps) - - types_metadata = [ - (Config.mimes_exif, self._metadata_exif), - (Config.mimes_png, self._metadata_png), - ] - self.metadata_mimetype_methods = self._make_method_dict(types_metadata) - - self.mime_processing_options = { - 'text': self.text, - 'audio': self.audio, - 'image': self.image, - 'video': self.video, - 'application': self.application, - 'example': self.example, - 'message': self.message, - 'model': self.model, - 'multipart': self.multipart, - 'inode': self.inode, - } - - def _check_dangerous(self): - if not self.has_mimetype: - self.make_dangerous('File has no mimetype') - if not self.has_extension: - self.make_dangerous('File has no extension') - if self.extension in Config.malicious_exts: - self.make_dangerous('Extension identifies file as potentially dangerous') - - def _check_extension(self): - """ - Guess the file's mimetype based on its extension. - - If the file's mimetype (as determined by libmagic) is contained in - the `mimetype` module's list of valid mimetypes and the expected - mimetype based on its extension differs from the mimetype determined - by libmagic, then mark the file as dangerous. 
- """ - if self.extension in Config.override_ext: - expected_mimetype = Config.override_ext[self.extension] - else: - expected_mimetype, encoding = mimetypes.guess_type(self.src_path, - strict=False) - if expected_mimetype in Config.aliases: - expected_mimetype = Config.aliases[expected_mimetype] - is_known_extension = self.extension in mimetypes.types_map.keys() - if is_known_extension and expected_mimetype != self.mimetype: - self.make_dangerous('Mimetype does not match expected mimetype for this extension') - - def _check_mimetype(self): - """ - Compare mimetype (as determined by libmagic) to extension. - - Determine whether the extension that are normally associated with - the mimetype include the file's actual extension. - """ - if self.mimetype in Config.aliases: - mimetype = Config.aliases[self.mimetype] - else: - mimetype = self.mimetype - expected_extensions = mimetypes.guess_all_extensions(mimetype, - strict=False) - if expected_extensions: - if self.has_extension and self.extension not in expected_extensions: - self.make_dangerous('Extension does not match expected extensions for this mimetype') - - def _check_filename(self): - if self.filename[0] is '.': - # TODO: handle dotfiles here - pass - right_to_left_override = u"\u202E" - if right_to_left_override in self.filename: - self.make_dangerous('Filename contains dangerous character') - self.dst_path = self.dst_path.replace(right_to_left_override, '') - # TODO: change self.filename and'filename' property? 
Or should those reflect the values on the source key - - def check(self): - if self.main_type in Config.ignored_mimes: - self.should_copy = False - self.mime_processing_options.get(self.main_type, self.unknown)() - else: - self._check_dangerous() - self._check_filename() - if self.has_extension: - self._check_extension() - if self.has_mimetype: - self._check_mimetype() - if not self.is_dangerous: - self.mime_processing_options.get(self.main_type, self.unknown)() - - def write_log(self): - props = self.get_all_props() - if not self.is_recursive: - if os.path.exists(self.tempdir_path): - # Hack to make images appear at the correct tree depth in log - self.logger.add_file(self.src_path, props, in_tempdir=True) - return - self.logger.add_file(self.src_path, props) - - # ##### Helper functions ##### - def _make_method_dict(self, list_of_tuples): - """Returns a dictionary with mimetype: method pairs.""" - dict_to_return = {} - for list_of_subtypes, method in list_of_tuples: - for subtype in list_of_subtypes: - dict_to_return[subtype] = method - return dict_to_return - - @property - def has_metadata(self): - """True if filetype typically contains metadata, else False.""" - if self.mimetype in Config.mimes_metadata: - return True - return False - - def make_tempdir(self): - """Make a temporary directory at self.tempdir_path.""" - if not os.path.exists(self.tempdir_path): - os.makedirs(self.tempdir_path) - return self.tempdir_path - - ####################### - # ##### Discarded mimetypes, reason in the docstring ###### - def inode(self): - """Empty file or symlink.""" - if self.is_symlink: - symlink_path = self.get_property('symlink') - self.add_description('File is a symlink to {}'.format(symlink_path)) - else: - self.add_description('File is an inode (empty file)') - - def unknown(self): - """Main type should never be unknown.""" - self.add_description('Unknown mimetype') - - def example(self): - """Used in examples, should never be returned by libmagic.""" - 
def multipart(self):
    """Used in web apps, should never be returned by libmagic"""
    self.add_description('Multipart file - usually found in web apps')


# ##### Treated as malicious, no reason to have it on a USB key ######
def message(self):
    """Mark a message file as dangerous."""
    self.make_dangerous('Message file - should not be found on USB key')


def model(self):
    """Mark a model file as dangerous."""
    self.make_dangerous('Model file - should not be found on USB key')


# ##### Files that will be converted ######
def text(self):
    """
    Handle a file whose main type is text.

    RTF subtypes are described and forced to a .txt extension, OOXML
    subtypes are handed to self._ooxml(), and everything else is treated
    as plain text and forced to .txt.
    """
    sub_type = self.sub_type
    if any(marker in sub_type for marker in Config.mimes_rtf):
        self.add_description('Rich Text (rtf) file')
        # TODO: need a way to convert it to plain text
        self.force_ext('.txt')
        return
    if any(marker in sub_type for marker in Config.mimes_ooxml):
        self.add_description('OOXML (openoffice) file')
        self._ooxml()
        return
    self.add_description('Plain text file')
    self.force_ext('.txt')


def application(self):
    """
    Dispatch an application file to the handler for its subtype.

    The first key of self.app_subtype_methods that is a substring of
    self.sub_type wins; with no match, self._unknown_app() is used.
    """
    # TODO: should we change the logic so we don't iterate through all of the subtype methods?
    # TODO: should these methods return a value?
    candidates = (
        handler
        for key, handler in self.app_subtype_methods.items()
        if key in self.sub_type
    )
    chosen = next(candidates, self._unknown_app)
    chosen()
def _executables(self):
    """Mark an executable file as dangerous."""
    # LOG: change the processing_type property to some other name or include in file_string
    self.make_dangerous('Executable file')


def _winoffice(self):
    """
    Process a winoffice file using olefile/oletools.

    Files that olefile does not recognise as OLE are parsed manually and
    marked dangerous when they cannot be parsed, have parsing issues, or
    contain a macro stream. Recognised OLE files are checked through
    oletools' OleID indicators for encryption, macros and embedded flash.
    The 'WinOffice file' description is always added at the end.
    """
    # Stream names whose presence indicates a VBA macro.
    macro_streams = ('macros/vba', 'Macros', '_VBA_PROJECT_CUR', 'VBA')
    oid = oletools.oleid.OleID(self.src_path)  # First assume a valid file
    if not olefile.isOleFile(self.src_path):
        # Manual processing, may already count as suspicious
        try:
            ole = olefile.OleFileIO(self.src_path, raise_defects=olefile.DEFECT_INCORRECT)
        except Exception:
            # Previously a bare `except:` fell through to `ole.parsing_issues`
            # with `ole` unbound, raising NameError instead of flagging the file.
            self.make_dangerous('Unparsable WinOffice file')
        else:
            if ole.parsing_issues:
                self.make_dangerous('Parsing issues with WinOffice file')
            elif any(ole.exists(stream) for stream in macro_streams):
                self.make_dangerous('WinOffice file containing a macro')
    else:
        indicators = oid.check()
        # Encrypted can be set by multiple checks on the script
        if oid.encrypted.value:
            self.make_dangerous('Encrypted WinOffice file')
        if oid.macros.value or any(oid.ole.exists(stream) for stream in macro_streams):
            self.make_dangerous('WinOffice file containing a macro')
        for i in indicators:
            if i.id == 'ObjectPool' and i.value:
                # TODO: is having an ObjectPool suspicious?
                # LOG: user defined property
                self.add_description('WinOffice file containing an object pool')
            elif i.id == 'flash' and i.value:
                self.make_dangerous('WinOffice file with embedded flash')
    self.add_description('WinOffice file')
def _ooxml(self):
    """
    Process an ooxml file.

    The file is marked dangerous when officedissector cannot open it or
    when it contains macros, ActiveX controls, embedded objects or
    embedded packages.
    """
    try:
        doc = officedissector.doc.Document(self.src_path)
    except Exception:
        self.make_dangerous('Invalid ooxml file')
        return
    # There are probably other potentially malicious features:
    # fonts, custom props, custom XML
    if doc.is_macro_enabled or len(doc.features.macros) > 0:
        self.make_dangerous('Ooxml file containing macro')
    if len(doc.features.embedded_controls) > 0:
        self.make_dangerous('Ooxml file with activex')
    if len(doc.features.embedded_objects) > 0:
        # Exploited by CVE-2014-4114 (OLE)
        self.make_dangerous('Ooxml file with embedded objects')
    if len(doc.features.embedded_packages) > 0:
        self.make_dangerous('Ooxml file with embedded packages')
    if not self.is_dangerous:
        self.add_description('OOXML file')


def _libreoffice(self):
    """
    Process a libreoffice file.

    The file is opened as a zip archive. It is marked dangerous when it
    cannot be opened, or when any member name starts with 'script',
    'basic' or 'object' or ends with '.bin' (case-insensitive).
    """
    # As long as there is no way to do a sanity check on the files => dangerous
    try:
        lodoc = zipfile.ZipFile(self.src_path, 'r')
    except Exception:
        # TODO: are there specific exceptions we should catch here? Or should it be everything
        self.make_dangerous('Invalid libreoffice file')
        # Without this return the loop below ran with `lodoc` unbound and
        # raised NameError.
        return
    with lodoc:  # make sure the archive handle is released
        for f in lodoc.infolist():
            fname = f.filename.lower()
            if fname.startswith(('script', 'basic', 'object')) or fname.endswith('.bin'):
                self.make_dangerous('Libreoffice file containing executable code')
    if not self.is_dangerous:
        self.add_description('Libreoffice file')


def _pdf(self):
    """
    Process a PDF file.

    Uses PDFiD keyword counts; each non-zero count of a risky feature
    marks the file dangerous.
    """
    xml_doc = PDFiD(self.src_path)
    pdf_id = cPDFiD(xml_doc, True)
    # TODO: are there other pdf characteristics which should be dangerous?
    if pdf_id.encrypt.count > 0:
        self.make_dangerous('Encrypted pdf')
    if pdf_id.js.count > 0 or pdf_id.javascript.count > 0:
        self.make_dangerous('Pdf with embedded javascript')
    if pdf_id.aa.count > 0 or pdf_id.openaction.count > 0:
        self.make_dangerous('Pdf with openaction(s)')
    if pdf_id.richmedia.count > 0:
        self.make_dangerous('Pdf containing flash')
    if pdf_id.launch.count > 0:
        self.make_dangerous('Pdf with launch action(s)')
    if pdf_id.xfa.count > 0:
        self.make_dangerous('Pdf with XFA structures')
    if pdf_id.objstm.count > 0:
        self.make_dangerous('Pdf with ObjectStream structures')
    if not self.is_dangerous:
        self.add_description('Pdf file')
def _archive(self):
    """
    Flag the file as an archive.

    Only records the description and sets ``should_copy = False`` and
    ``is_recursive = True``; the extraction itself is not performed here.
    """
    # TODO: change this to something archive type specific instead of generic 'Archive'
    self.add_description('Archive')
    self.should_copy = False
    self.is_recursive = True


def _unknown_app(self):
    """Process an unknown file."""
    self.add_description('Unknown application file')
    self.make_unknown()


def _binary_app(self):
    """Process an unknown binary file."""
    self.add_description('Unknown binary file')
    self.make_binary()


#######################
# Metadata extractors
def _metadata_exif(self, metadata_file_path):
    """
    Read exif metadata from a jpg or tiff file using exifread.

    Writes one ``Key: ...\\tValue: ...`` line per tag (thumbnails excluded)
    to ``metadata_file_path`` and sets the 'metadata' property to 'exif'.

    Returns True on success, False when no metadata could be read.
    """
    # TODO: can we shorten this method somehow?
    # The context manager closes the image on every path, including
    # unexpected exceptions.
    with open(self.src_path, 'rb') as img:
        tags = None
        try:
            tags = exifread.process_file(img, debug=True)
        except Exception as e:
            self.add_error(e, "Error while trying to grab full metadata for file {}; retrying for partial data.".format(self.src_path))
        if tags is None:
            try:
                # Rewind so the retry does not start from wherever the
                # failed attempt left the stream.
                img.seek(0)
                tags = exifread.process_file(img, debug=True)
            except Exception as e:
                self.add_error(e, "Failed to get any metadata for file {}.".format(self.src_path))
                return False
    # Open the metadata file once: reopening it with 'w+' for every tag
    # truncated it each time and left only the last tag on disk.
    with open(metadata_file_path, 'w+') as metadata_file:
        for tag in sorted(tags.keys()):
            # These tags are long and obnoxious/binary so we don't add them
            if tag not in ('JPEGThumbnail', 'TIFFThumbnail'):
                tag_string = str(tags[tag])
                # Exifreader truncates data.
                if len(tag_string) > 25 and tag_string.endswith(", ... ]"):
                    tag_value = tags[tag].values
                    tag_string = str(tag_value)
                metadata_file.write("Key: {}\tValue: {}\n".format(tag, tag_string))
    # TODO: how do we want to log metadata?
    self.set_property('metadata', 'exif')
    return True
def _metadata_png(self, metadata_file_path):
    """
    Extract metadata from a png file using PIL/Pillow.

    Writes one ``Key: ...\\tValue: ...`` line per entry of the image's
    ``info`` dict (except 'icc_profile') to ``metadata_file_path`` and sets
    the 'metadata' property to 'png'.

    Returns True on success. On any exception the error is recorded, the
    file is marked dangerous and False is returned.
    """
    # Turn Pillow's decompression-bomb warning into an exception so it is
    # caught below.
    warnings.simplefilter('error', Image.DecompressionBombWarning)
    try:
        # Both handles are closed on every path. The metadata file is
        # opened once: reopening it with 'w+' per tag kept only the last
        # tag.
        with Image.open(self.src_path) as img, \
                open(metadata_file_path, 'w+') as metadata_file:
            for tag in sorted(img.info.keys()):
                # These are long and obnoxious/binary
                # One-element tuple: ('icc_profile') is just a string and
                # made `not in` a substring test.
                if tag not in ('icc_profile',):
                    metadata_file.write("Key: {}\tValue: {}\n".format(tag, img.info[tag]))
        # LOG: handle metadata
        self.set_property('metadata', 'png')
    except Exception as e:  # Catch decompression bombs
        # TODO: only catch DecompressionBombWarnings here?
        self.add_error(e, "Caught exception processing metadata for {}".format(self.src_path))
        self.make_dangerous('exception processing metadata')
        return False
    return True
def extract_metadata(self):
    """Create metadata file and call correct metadata extraction method."""
    metadata_file_path = self.create_metadata_file(".metadata.txt")
    mt = self.mimetype
    metadata_processing_method = self.metadata_mimetype_methods.get(mt)
    if metadata_processing_method:
        # TODO: should we return metadata and write it here instead of in processing method?
        metadata_processing_method(metadata_file_path)


#######################
# ##### Media - audio and video aren't converted ######
def audio(self):
    """Process an audio file."""
    self.add_description('Audio file')
    self._media_processing()


def video(self):
    """Process a video."""
    self.add_description('Video file')
    self._media_processing()


def _media_processing(self):
    """Generic way to process all media files."""
    self.add_description('Media file')


def image(self):
    """
    Process an image.

    Extracts metadata using self.extract_metadata() if the file type
    typically carries metadata. Creates the temporary directory, rebuilds
    the image from its raw pixel data with PIL.Image, saves it there and
    points self.src_path at the saved copy. Any exception during the
    conversion marks the file dangerous.
    """
    # TODO: make sure this method works for png, gif, tiff
    if self.has_metadata:
        self.extract_metadata()
    tempdir_path = self.make_tempdir()
    tempfile_path = os.path.join(tempdir_path, self.filename)
    # Turn Pillow's decompression-bomb warning into an exception so it is
    # caught below.
    warnings.simplefilter('error', Image.DecompressionBombWarning)
    try:  # Do image conversions
        # Close the source image as soon as its pixels have been copied;
        # it used to stay open.
        with Image.open(self.src_path) as img_in:
            img_out = Image.frombytes(img_in.mode, img_in.size, img_in.tobytes())
        img_out.save(tempfile_path)
        self.src_path = tempfile_path
    except Exception as e:  # Catch decompression bombs
        # TODO: change this from all Exceptions to specific DecompressionBombWarning
        self.add_error(e, "Caught exception (possible decompression bomb?) while translating file {}.".format(self.src_path))
        self.make_dangerous('Image file containing decompression bomb')
    if not self.is_dangerous:
        self.add_description('Image file')
def _convert_size(self, size, precision=2):
    """
    Format a byte count as a human-readable string.

    ``size`` is divided by 1024 until it drops below 1024 or the largest
    suffix (GB) is reached, then rendered with ``precision`` decimals and
    the matching suffix, e.g. ``2048 -> '2.00KB'``.
    """
    suffixes = ['B', 'KB', 'MB', 'GB']
    suffix_index = 0
    # Stop at the last suffix: the old bound (`< 4`) let the index run one
    # past the end of the list and raised IndexError for sizes above
    # 1024**4. `>=` makes exactly 1024 render as '1.00KB'.
    while size >= 1024 and suffix_index < len(suffixes) - 1:
        suffix_index += 1
        size = size / 1024.0
    return "%.*f%s" % (precision, size, suffixes[suffix_index])
#!/bin/bash

# To install python script filecheck.py
# Tested on Raspbian Jessie
#
# Run as a regular user with sudo rights. The three install steps are run
# in a root shell; the final smoke test runs from the invoking user's shell.

set -euo pipefail

WORKDIR="${HOME}/Documents/CIRCLPy"


# Install system packages, Python libraries and pdfid.
function installDependencies {
    apt-get update
    apt-get -y install autoconf libtool python-lxml \
        p7zip-full p7zip-rar libxml2-dev libxslt1-dev
    pip3 install lxml oletools olefile exifread pillow python-magic
    pip3 install git+https://github.com/Rafiot/officedissector.git

    wget https://didierstevens.com/files/software/pdfid_v0_2_1.zip
    unzip -o pdfid_v0_2_1.zip
}

# Fetch PyCIRCLean and install it.
function setupPyCIRCLean {
    git clone https://github.com/CIRCL/PyCIRCLean
    # setup.py needs a command; run it from inside the checkout.
    (cd PyCIRCLean && python3 setup.py install)
}

# Install ClamAV and derive its config files from the shipped samples.
function installClam {
    apt-get install -y clamav
    # Absolute path (the original `cd usr/local/etc/` was relative), in a
    # subshell so the working directory of later steps is unaffected.
    (
        cd /usr/local/etc/

        tail -n +10 clamd.conf.sample > clamd.conf
        sed -i -e 's/#LocalSocket/LocalSocket/g' clamd.conf
        sed -i -e 's/#DatabaseDirectory/DatabaseDirectory/g' clamd.conf

        tail -n +10 freshclam.conf.sample > freshclam.conf
        sed -i -e 's/#DatabaseDirectory/DatabaseDirectory/g' freshclam.conf
    )
}

# Smoke test: groom a sample directory and list the result.
# Named runTest so it does not shadow the shell builtin `test`.
function runTest {
    mkdir -p source dest
    cp -fr PyCIRCLean/slides/PyCIRCLean source/.
    sudo python3 PyCIRCLean/bin/filecheck.py -s source -d dest
    echo Results of the test :
    ls dest/
    sudo rm -rf dest/
    sudo rm -rf source/
}

# Run one of the functions above in a root shell. `sudo su` inside a script
# only opens an interactive shell; it does not elevate the lines after it.
function runAsRoot {
    sudo bash -c "set -euo pipefail; $(declare -f "$1"); $1"
}


mkdir -p "${WORKDIR}"
cd "${WORKDIR}"

runAsRoot installDependencies
runAsRoot installClam
runAsRoot setupPyCIRCLean

# The original called `exit` before the test, so it never ran.
runTest