mirror of https://github.com/CIRCL/PyCIRCLean
				
				
				
			
		
			
				
	
	
		
			836 lines
		
	
	
		
			35 KiB
		
	
	
	
		
			Python
		
	
	
			
		
		
	
	
			836 lines
		
	
	
		
			35 KiB
		
	
	
	
		
			Python
		
	
	
| #!/usr/bin/env python3
 | |
| # -*- coding: utf-8 -*-
 | |
| import os
 | |
| import mimetypes
 | |
| import shlex
 | |
| import subprocess
 | |
| import zipfile
 | |
| import argparse
 | |
| import random
 | |
| import shutil
 | |
| import time
 | |
| import hashlib
 | |
| 
 | |
| import oletools.oleid
 | |
| import olefile
 | |
| import officedissector
 | |
| import warnings
 | |
| import exifread
 | |
| from PIL import Image
 | |
| from pdfid import PDFiD, cPDFiD
 | |
| 
 | |
| from kittengroomer import FileBase, KittenGroomerBase, Logging
 | |
| 
 | |
| 
 | |
class Config:
    """Static lookup tables (mimetypes, aliases, extensions) for filecheck.py."""

    # ---- Mimetype tables ----
    # Fragments matched against <subtype> in 'application/<subtype>'.
    mimes_ooxml = ('vnd.openxmlformats-officedocument.',)
    mimes_office = ('msword', 'vnd.ms-',)
    mimes_libreoffice = ('vnd.oasis.opendocument',)
    mimes_rtf = ('rtf', 'richtext',)
    mimes_pdf = ('pdf', 'postscript',)
    mimes_xml = ('xml',)
    mimes_ms = ('dosexec',)
    mimes_compressed = (
        'zip', 'rar', 'x-rar', 'bzip2', 'lzip', 'lzma', 'lzop',
        'xz', 'compress', 'gzip', 'tar',
    )
    mimes_data = ('octet-stream',)
    mimes_audio = ('ogg',)

    # Full image mimetypes, grouped by the metadata extractor that handles them.
    mimes_exif = ('image/jpeg', 'image/tiff',)
    mimes_png = ('image/png',)

    # Full mimetypes for which metadata extraction is attempted.
    mimes_metadata = ('image/jpeg', 'image/tiff', 'image/png',)

    # Mimetypes that should be treated as equivalent to one another.
    aliases = {
        # Windows executables
        'application/x-msdos-program': 'application/x-dosexec',
        'application/x-dosexec': 'application/x-msdos-program',
        # Other applications with confusing mimetypes
        'application/rtf': 'text/rtf',
        'application/rar': 'application/x-rar',
        'application/ogg': 'audio/ogg',
        'audio/ogg': 'application/ogg'
    }

    # ---- Extension tables ----
    # Extensions commonly used by malicious files. Sources:
    # http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
    # https://github.com/wiregit/wirecode/blob/master/components/core-settings/src/main/java/org/limewire/core/settings/FilterSettings.java
    # The groups below are concatenated in order; some extensions appear in
    # more than one group.
    malicious_exts = (
        # Applications
        (
            ".exe", ".pif", ".application", ".gadget", ".msi", ".msp", ".com", ".scr",
            ".hta", ".cpl", ".msc", ".jar",
        )
        # Scripts
        + (
            ".bat", ".cmd", ".vb", ".vbs", ".vbe", ".js", ".jse", ".ws", ".wsf",
            ".wsc", ".wsh", ".ps1", ".ps1xml", ".ps2", ".ps2xml", ".psc1", ".psc2",
            ".msh", ".msh1", ".msh2", ".mshxml", ".msh1xml", ".msh2xml",
        )
        # Shortcuts
        + (".scf", ".lnk", ".inf",)
        # Other
        + (".reg", ".dll",)
        # Office macro (OOXML with macro enabled)
        + (
            ".docm", ".dotm", ".xlsm", ".xltm", ".xlam", ".pptm", ".potm", ".ppam",
            ".ppsm", ".sldm",
        )
        # Banned by wirecode
        + (
            ".asf", ".asx", ".au", ".htm", ".html", ".mht", ".vbs",
            ".wax", ".wm", ".wma", ".wmd", ".wmv", ".wmx", ".wmz", ".wvx",
        )
        # Google Chrome malicious extensions
        + (
            ".ad", ".ade", ".adp", ".ah", ".apk", ".app", ".application", ".asp",
            ".asx", ".bas", ".bash", ".bat", ".cfg", ".chi", ".chm", ".class",
            ".cmd", ".com", ".command", ".crt", ".crx", ".csh", ".deb", ".dex",
            ".dll", ".drv", ".exe", ".fxp", ".grp", ".hlp", ".hta", ".htm", ".html",
            ".htt", ".inf", ".ini", ".ins", ".isp", ".jar", ".jnlp", ".user.js",
            ".js", ".jse", ".ksh", ".lnk", ".local", ".mad", ".maf", ".mag", ".mam",
            ".manifest", ".maq", ".mar", ".mas", ".mat", ".mau", ".mav", ".maw",
            ".mda", ".mdb", ".mde", ".mdt", ".mdw", ".mdz", ".mht", ".mhtml", ".mmc",
            ".mof", ".msc", ".msh", ".mshxml", ".msi", ".msp", ".mst", ".ocx", ".ops",
            ".pcd", ".pif", ".pkg", ".pl", ".plg", ".prf", ".prg", ".pst", ".py",
            ".pyc", ".pyw", ".rb", ".reg", ".rpm", ".scf", ".scr", ".sct", ".sh",
            ".shar", ".shb", ".shs", ".shtm", ".shtml", ".spl", ".svg", ".swf", ".sys",
            ".tcsh", ".url", ".vb", ".vbe", ".vbs", ".vsd", ".vsmacros", ".vss",
            ".vst", ".vsw", ".ws", ".wsc", ".wsf", ".wsh", ".xbap", ".xht", ".xhtm",
            ".xhtml", ".xml", ".xsl", ".xslt", ".website", ".msh1", ".msh2", ".msh1xml",
            ".msh2xml", ".ps1", ".ps1xml", ".ps2", ".ps2xml", ".psc1", ".psc2", ".xnk",
            ".appref-ms", ".gadget", ".efi", ".fon", ".partial", ".svg", ".xml",
            ".xrm_ms", ".xsl", ".action", ".bin", ".inx", ".ipa", ".isu", ".job",
            ".out", ".pad", ".paf", ".rgs", ".u3p", ".vbscript", ".workflow", ".001",
            ".ace", ".arc", ".arj", ".b64", ".balz", ".bhx", ".cab", ".cpio", ".fat",
            ".hfs", ".hqx", ".iso", ".lha", ".lpaq1", ".lpaq5", ".lpaq8", ".lzh",
            ".mim", ".ntfs", ".paq8f", ".paq8jd", ".paq8l", ".paq8o", ".pea", ".quad",
            ".r00", ".r01", ".r02", ".r03", ".r04", ".r05", ".r06", ".r07", ".r08",
            ".r09", ".r10", ".r11", ".r12", ".r13", ".r14", ".r15", ".r16", ".r17",
            ".r18", ".r19", ".r20", ".r21", ".r22", ".r23", ".r24", ".r25", ".r26",
            ".r27", ".r28", ".r29", ".squashfs", ".swm", ".tpz", ".txz", ".tz", ".udf",
            ".uu", ".uue", ".vhd", ".vmdk", ".wim", ".wrc", ".xar", ".xxe", ".z",
            ".zipx", ".zpaq", ".cdr", ".dart", ".dc42", ".diskcopy42", ".dmg",
            ".dmgpart", ".dvdr", ".img", ".imgpart", ".ndif", ".smi", ".sparsebundle",
            ".sparseimage", ".toast", ".udif",
        )
    )

    # mimetypes.guess_type sometimes gives unexpected results, e.g. for
    # .tar.gz files:
    #   mimetypes.guess_type('toot.tar.gz', strict=False)
    #   -> ('application/x-tar', 'gzip')
    # Extensions listed here map straight to the mimetype to expect instead.
    override_ext = {'.gz': 'application/gzip'}
 | |
| 
 | |
| 
 | |
# Filesystem path of the 7-Zip executable.
# NOTE(review): not referenced in the visible part of this file; presumably
# used by the archive-extraction code further down -- confirm.
SEVENZ_PATH = '/usr/bin/7z'
 | |
| 
 | |
| 
 | |
| class File(FileBase):
 | |
|     """
 | |
|     Main file object
 | |
| 
 | |
|     Created for each file that is processed by KittenGroomer. Contains all
 | |
|     filetype-specific processing methods.
 | |
|     """
 | |
| 
 | |
|     def __init__(self, src_path, dst_path):
 | |
|         super(File, self).__init__(src_path, dst_path)
 | |
|         self.is_archive = False
 | |
|         self.tempdir_path = self.dst_path + '_temp'
 | |
| 
 | |
|         subtypes_apps = (
 | |
|             (Config.mimes_office, self._winoffice),
 | |
|             (Config.mimes_ooxml, self._ooxml),
 | |
|             (Config.mimes_rtf, self.text),
 | |
|             (Config.mimes_libreoffice, self._libreoffice),
 | |
|             (Config.mimes_pdf, self._pdf),
 | |
|             (Config.mimes_xml, self.text),
 | |
|             (Config.mimes_ms, self._executables),
 | |
|             (Config.mimes_compressed, self._archive),
 | |
|             (Config.mimes_data, self._binary_app),
 | |
|             (Config.mimes_audio, self.audio)
 | |
|         )
 | |
|         self.app_subtype_methods = self._make_method_dict(subtypes_apps)
 | |
| 
 | |
|         types_metadata = (
 | |
|             (Config.mimes_exif, self._metadata_exif),
 | |
|             (Config.mimes_png, self._metadata_png),
 | |
|         )
 | |
|         self.metadata_mimetype_methods = self._make_method_dict(types_metadata)
 | |
| 
 | |
|         self.mime_processing_options = {
 | |
|             'text': self.text,
 | |
|             'audio': self.audio,
 | |
|             'image': self.image,
 | |
|             'video': self.video,
 | |
|             'application': self.application,
 | |
|             'example': self.example,
 | |
|             'message': self.message,
 | |
|             'model': self.model,
 | |
|             'multipart': self.multipart,
 | |
|             'inode': self.inode,
 | |
|         }
 | |
| 
 | |
|     def __repr__(self):
 | |
|         return "<filecheck.File object: {{{}}}>".format(self.filename)
 | |
| 
 | |
|     def _check_extension(self):
 | |
|         """
 | |
|         Guess the file's mimetype based on its extension.
 | |
| 
 | |
|         If the file's mimetype (as determined by libmagic) is contained in
 | |
|         the `mimetype` module's list of valid mimetypes and the expected
 | |
|         mimetype based on its extension differs from the mimetype determined
 | |
|         by libmagic, then mark the file as dangerous.
 | |
|         """
 | |
|         if not self.has_extension:
 | |
|             self.make_dangerous('File has no extension')
 | |
|         else:
 | |
|             if self.extension in Config.override_ext:
 | |
|                 expected_mimetypes = Config.override_ext[self.extension]
 | |
|             else:
 | |
|                 expected_mimetype, encoding = mimetypes.guess_type(self.src_path,
 | |
|                                                                    strict=False)
 | |
| 
 | |
|                 expected_mimetypes = [expected_mimetype]
 | |
|                 if expected_mimetype in Config.aliases:
 | |
|                     expected_mimetypes.append(Config.aliases[expected_mimetype])
 | |
|             if (encoding is None) and (os.path.getsize(self.src_path) == 0):
 | |
|                 is_empty_file = True
 | |
|             else:
 | |
|                 is_empty_file = False
 | |
| 
 | |
|             is_known_extension = self.extension in mimetypes.types_map.keys()
 | |
|             if is_known_extension and self.mimetype not in expected_mimetypes and not is_empty_file:
 | |
|                 self.make_dangerous('Mimetype does not match expected mimetypes ({}) for this extension'.format(expected_mimetypes))
 | |
| 
 | |
|     def _check_mimetype(self):
 | |
|         """
 | |
|         Compare mimetype (as determined by libmagic) to extension.
 | |
| 
 | |
|         Determine whether the extension that are normally associated with
 | |
|         the mimetype include the file's actual extension.
 | |
|         """
 | |
|         if not self.has_mimetype:
 | |
|             self.make_dangerous('File has no mimetype')
 | |
|         else:
 | |
|             if self.mimetype in Config.aliases:
 | |
|                 mimetype = Config.aliases[self.mimetype]
 | |
|             else:
 | |
|                 mimetype = self.mimetype
 | |
|             expected_extensions = mimetypes.guess_all_extensions(mimetype,
 | |
|                                                                  strict=False)
 | |
|             if mimetype in Config.aliases:
 | |
|                 expected_extensions += mimetypes.guess_all_extensions(Config.aliases[mimetype], strict=False)
 | |
|             if expected_extensions:
 | |
|                 if self.has_extension and self.extension not in expected_extensions:
 | |
|                     self.make_dangerous('Extension does not match expected extensions ({}) for this mimetype'.format(expected_extensions))
 | |
| 
 | |
|     def _check_filename(self):
 | |
|         """
 | |
|         Verify the filename
 | |
| 
 | |
|         If the filename contains any dangerous or specific characters, handle
 | |
|         them appropriately.
 | |
|         """
 | |
|         if self.filename.startswith('.'):
 | |
|             macos_hidden_files = set(
 | |
|                 '.Trashes', '._.Trashes', '.DS_Store', '.fseventsd', '.Spotlight-V100'
 | |
|             )
 | |
|             if self.filename in macos_hidden_files:
 | |
|                 self.add_description('MacOS metadata file, added by MacOS to USB drives and some directories')
 | |
|                 self.should_copy = False
 | |
|         right_to_left_override = u"\u202E"
 | |
|         if right_to_left_override in self.filename:
 | |
|             self.make_dangerous('Filename contains dangerous character')
 | |
|             new_filename = self.filename.replace(right_to_left_override, '')
 | |
|             self.set_property('filename', new_filename)
 | |
| 
 | |
|     def _check_malicious_exts(self):
 | |
|         """Check that the file's extension isn't contained in a blacklist"""
 | |
|         if self.extension in Config.malicious_exts:
 | |
|             self.make_dangerous('Extension identifies file as potentially dangerous')
 | |
| 
 | |
|     def _compute_random_hashes(self):
 | |
|         """Compute a random amount of hashes at random positions in the file to ensure integrity after the copy (mitigate TOCTOU attacks)"""
 | |
|         if not os.path.exists(self.src_path) or os.path.isdir(self.src_path) or self.maintype == 'image':
 | |
|             # Images are converted, no need to compute the hashes
 | |
|             return
 | |
|         self.random_hashes = []
 | |
|         if self.size < 64:
 | |
|             # hash the whole file
 | |
|             self.block_length = self.size
 | |
|         else:
 | |
|             if self.size < 128:
 | |
|                 # Get a random length between 16 and the size of the file
 | |
|                 self.block_length = random.randint(16, self.size)
 | |
|             else:
 | |
|                 # Get a random length between 16 and 128
 | |
|                 self.block_length = random.randint(16, 128)
 | |
| 
 | |
|         for i in range(random.randint(3, 6)):  # Do a random amound of read on the file (between 5 and 10)
 | |
|             start_pos = random.randint(0, self.size - self.block_length)  # Pick a random length for the hash to compute
 | |
|             with open(self.src_path, 'rb') as f:
 | |
|                 f.seek(start_pos)
 | |
|                 hashed = hashlib.sha256(f.read(self.block_length)).hexdigest()
 | |
|                 self.random_hashes.append((start_pos, hashed))
 | |
|                 time.sleep(random.uniform(0.1, 0.5))  # Add a random sleep length
 | |
| 
 | |
|     def _validate_random_hashes(self):
 | |
|         """Validate hashes computed by _compute_random_hashes"""
 | |
|         if not os.path.exists(self.src_path) or os.path.isdir(self.src_path) or self.maintype == 'image':
 | |
|             # Images are converted, we don't have to fear TOCTOU
 | |
|             return True
 | |
|         for start_pos, hashed_src in self.random_hashes:
 | |
|             with open(self.dst_path, 'rb') as f:
 | |
|                 f.seek(start_pos)
 | |
|                 hashed = hashlib.sha256(f.read(self.block_length)).hexdigest()
 | |
|                 if hashed != hashed_src:
 | |
|                     # Something fucked up happened
 | |
|                     return False
 | |
|         return True
 | |
| 
 | |
|     def check(self):
 | |
|         """
 | |
|         Main file processing method.
 | |
| 
 | |
|         First, checks for basic properties that might indicate a dangerous file.
 | |
|         If the file isn't dangerous, then delegates to various helper methods
 | |
|         for filetype-specific checks based on the file's mimetype.
 | |
|         """
 | |
|         # Any of these methods can call make_dangerous():
 | |
|         self._check_malicious_exts()
 | |
|         self._check_mimetype()
 | |
|         self._check_extension()
 | |
|         self._check_filename()  # can mutate self.filename
 | |
|         self._compute_random_hashes()
 | |
| 
 | |
|         if not self.is_dangerous:
 | |
|             self.mime_processing_options.get(self.maintype, self.unknown)()
 | |
| 
 | |
|     # ##### Helper functions #####
 | |
|     def _make_method_dict(self, list_of_tuples):
 | |
|         """Returns a dictionary with mimetype: method pairs."""
 | |
|         dict_to_return = {}
 | |
|         for list_of_subtypes, method in list_of_tuples:
 | |
|             for subtype in list_of_subtypes:
 | |
|                 dict_to_return[subtype] = method
 | |
|         return dict_to_return
 | |
| 
 | |
|     @property
 | |
|     def has_metadata(self):
 | |
|         """True if filetype typically contains metadata, else False."""
 | |
|         if self.mimetype in Config.mimes_metadata:
 | |
|             return True
 | |
|         return False
 | |
| 
 | |
|     def make_tempdir(self):
 | |
|         """Make a temporary directory at self.tempdir_path."""
 | |
|         if not os.path.exists(self.tempdir_path):
 | |
|             os.makedirs(self.tempdir_path)
 | |
|         return self.tempdir_path
 | |
| 
 | |
|     #######################
 | |
|     # ##### Discarded mimetypes, reason in the docstring ######
 | |
|     def inode(self):
 | |
|         """Empty file or symlink."""
 | |
|         if self.is_symlink:
 | |
|             symlink_path = self.get_property('symlink')
 | |
|             self.add_description('File is a symlink to {}'.format(symlink_path))
 | |
|         else:
 | |
|             self.add_description('File is an inode (empty file)')
 | |
|         self.should_copy = False
 | |
| 
 | |
|     def unknown(self):
 | |
|         """Main type should never be unknown."""
 | |
|         self.add_description('Unknown mimetype')
 | |
|         self.should_copy = False
 | |
| 
 | |
|     def example(self):
 | |
|         """Used in examples, should never be returned by libmagic."""
 | |
|         self.add_description('Example file')
 | |
|         self.should_copy = False
 | |
| 
 | |
|     def multipart(self):
 | |
|         """Used in web apps, should never be returned by libmagic"""
 | |
|         self.add_description('Multipart file - usually found in web apps')
 | |
|         self.should_copy = False
 | |
| 
 | |
|     # ##### Treated as malicious, no reason to have it on a USB key ######
 | |
|     def message(self):
 | |
|         """Process a message file."""
 | |
|         self.make_dangerous('Message file - should not be found on USB key')
 | |
| 
 | |
|     def model(self):
 | |
|         """Process a model file."""
 | |
|         self.make_dangerous('Model file - should not be found on USB key')
 | |
| 
 | |
|     # ##### Files that will be converted ######
 | |
|     def text(self):
 | |
|         """Process an rtf, ooxml, or plaintext file."""
 | |
|         for mt in Config.mimes_rtf:
 | |
|             if mt in self.subtype:
 | |
|                 self.add_description('Rich Text (rtf) file')
 | |
|                 self.force_ext('.txt')
 | |
|                 return
 | |
|         for mt in Config.mimes_ooxml:
 | |
|             if mt in self.subtype:
 | |
|                 self._ooxml()
 | |
|                 return
 | |
|         self.add_description('Plain text file')
 | |
|         self.force_ext('.txt')
 | |
| 
 | |
|     def application(self):
 | |
|         """Process an application specific file according to its subtype."""
 | |
|         for subtype, method in self.app_subtype_methods.items():
 | |
|             if subtype in self.subtype:  # checking for partial matches
 | |
|                 method()
 | |
|                 return
 | |
|         self._unknown_app()  # if none of the methods match
 | |
| 
 | |
|     def _executables(self):
 | |
|         """Process an executable file."""
 | |
|         self.make_dangerous('Executable file')
 | |
| 
 | |
|     def _winoffice(self):
 | |
|         """Process a winoffice file using olefile/oletools."""
 | |
|         oid = oletools.oleid.OleID(self.src_path)  # First assume a valid file
 | |
|         if not olefile.isOleFile(self.src_path):
 | |
|             # Manual processing, may already count as suspicious
 | |
|             try:
 | |
|                 ole = olefile.OleFileIO(self.src_path, raise_defects=olefile.DEFECT_INCORRECT)
 | |
|             except Exception:
 | |
|                 self.make_dangerous('Unparsable WinOffice file')
 | |
|             if ole.parsing_issues:
 | |
|                 self.make_dangerous('Parsing issues with WinOffice file')
 | |
|             else:
 | |
|                 if ole.exists('macros/vba') or ole.exists('Macros') \
 | |
|                         or ole.exists('_VBA_PROJECT_CUR') or ole.exists('VBA'):
 | |
|                     self.make_dangerous('WinOffice file containing a macro')
 | |
|         else:
 | |
|             indicators = oid.check()
 | |
|             for i in indicators:
 | |
|                 if i.id == 'ObjectPool' and i.value:
 | |
|                     self.make_dangerous('WinOffice file containing an object pool')
 | |
|                 elif i.id == 'flash' and i.value:
 | |
|                     self.make_dangerous('WinOffice file with embedded flash')
 | |
|                 elif i.id == 'encrypted' and i.value:
 | |
|                     self.make_dangerous('Encrypted WinOffice file')
 | |
|                 elif i.id == 'vba_macros' and i.value:
 | |
|                     self.make_dangerous('WinOffice file containing a macro')
 | |
| 
 | |
|         self.add_description('WinOffice file')
 | |
| 
 | |
|     def _ooxml(self):
 | |
|         """Process an ooxml file."""
 | |
|         self.add_description('OOXML (openoffice) file')
 | |
|         try:
 | |
|             doc = officedissector.doc.Document(self.src_path)
 | |
|         except Exception:
 | |
|             self.make_dangerous('Invalid ooxml file')
 | |
|             return
 | |
|         # There are probably other potentially malicious features:
 | |
|         # fonts, custom props, custom XML
 | |
|         if doc.is_macro_enabled or len(doc.features.macros) > 0:
 | |
|             self.make_dangerous('Ooxml file containing macro')
 | |
|         if len(doc.features.embedded_controls) > 0:
 | |
|             self.make_dangerous('Ooxml file with activex')
 | |
|         if len(doc.features.embedded_objects) > 0:
 | |
|             # Exploited by CVE-2014-4114 (OLE)
 | |
|             self.make_dangerous('Ooxml file with embedded objects')
 | |
|         if len(doc.features.embedded_packages) > 0:
 | |
|             self.make_dangerous('Ooxml file with embedded packages')
 | |
| 
 | |
|     def _libreoffice(self):
 | |
|         """Process a libreoffice file."""
 | |
|         # As long as there is no way to do a sanity check on the files => dangerous
 | |
|         try:
 | |
|             lodoc = zipfile.ZipFile(self.src_path, 'r')
 | |
|         except Exception:
 | |
|             # TODO: are there specific exceptions we should catch here? Or should it be everything
 | |
|             self.make_dangerous('Invalid libreoffice file')
 | |
|         for f in lodoc.infolist():
 | |
|             fname = f.filename.lower()
 | |
|             if fname.startswith('script') or fname.startswith('basic') or \
 | |
|                     fname.startswith('object') or fname.endswith('.bin'):
 | |
|                 self.make_dangerous('Libreoffice file containing executable code')
 | |
|         if not self.is_dangerous:
 | |
|             self.add_description('Libreoffice file')
 | |
| 
 | |
|     def _pdf(self):
 | |
|         """Process a PDF file."""
 | |
|         xmlDoc = PDFiD(self.src_path)
 | |
|         oPDFiD = cPDFiD(xmlDoc, True)
 | |
|         if oPDFiD.encrypt.count > 0:
 | |
|             self.make_dangerous('Encrypted pdf')
 | |
|         if oPDFiD.js.count > 0 or oPDFiD.javascript.count > 0:
 | |
|             self.make_dangerous('Pdf with embedded javascript')
 | |
|         if oPDFiD.aa.count > 0 or oPDFiD.openaction.count > 0:
 | |
|             self.make_dangerous('Pdf with openaction(s)')
 | |
|         if oPDFiD.richmedia.count > 0:
 | |
|             self.make_dangerous('Pdf containing flash')
 | |
|         if oPDFiD.launch.count > 0:
 | |
|             self.make_dangerous('Pdf with launch action(s)')
 | |
|         if oPDFiD.xfa.count > 0:
 | |
|             self.make_dangerous('Pdf with XFA structures')
 | |
|         if oPDFiD.objstm.count > 0:
 | |
|             self.make_dangerous('Pdf with ObjectStream structures')
 | |
|         if not self.is_dangerous:
 | |
|             self.add_description('Pdf file')
 | |
| 
 | |
|     def _archive(self):
 | |
|         """
 | |
|         Process an archive using 7zip.
 | |
| 
 | |
|         The archive is extracted to a temporary directory and self.process_dir
 | |
|         is called on that directory. The recursive archive depth is increased
 | |
|         to protect against archive bombs.
 | |
|         """
 | |
|         # TODO: change this to something archive type specific instead of generic 'Archive'
 | |
|         self.add_description('Archive')
 | |
|         self.should_copy = False
 | |
|         self.is_archive = True
 | |
| 
 | |
|     def _unknown_app(self):
 | |
|         """Process an unknown file."""
 | |
|         self.make_dangerous('Unknown application file')
 | |
| 
 | |
|     def _binary_app(self):
 | |
|         """Process an unknown binary file."""
 | |
|         self.make_dangerous('Unknown binary file')
 | |
| 
 | |
|     #######################
 | |
|     # Metadata extractors
 | |
|     def _metadata_exif(self, metadata_file_path):
 | |
|         """Read exif metadata from a jpg or tiff file using exifread."""
 | |
|         # TODO: can we shorten this method somehow?
 | |
|         with open(self.src_path, 'rb') as img:
 | |
|             tags = None
 | |
|             try:
 | |
|                 tags = exifread.process_file(img, debug=True)
 | |
|             except Exception as e:
 | |
|                 self.add_error(e, "Error while trying to grab full metadata for file {}; retrying for partial data.".format(self.src_path))
 | |
|             if tags is None:
 | |
|                 try:
 | |
|                     tags = exifread.process_file(img, debug=True)
 | |
|                 except Exception as e:
 | |
|                     self.add_error(e, "Failed to get any metadata for file {}.".format(self.src_path))
 | |
|                     return False
 | |
|             for tag in sorted(tags.keys()):
 | |
|                 # These tags are long and obnoxious/binary so we don't add them
 | |
|                 if tag not in ('JPEGThumbnail', 'TIFFThumbnail'):
 | |
|                     tag_string = str(tags[tag])
 | |
|                     # Exifreader truncates data.
 | |
|                     if len(tag_string) > 25 and tag_string.endswith(", ... ]"):
 | |
|                         tag_value = tags[tag].values
 | |
|                         tag_string = str(tag_value)
 | |
|                     with open(metadata_file_path, 'w+') as metadata_file:
 | |
|                         metadata_file.write("Key: {}\tValue: {}\n".format(tag, tag_string))
 | |
|             # TODO: how do we want to log metadata?
 | |
|             self.set_property('metadata', 'exif')
 | |
|         return True
 | |
| 
 | |
|     def _metadata_png(self, metadata_file_path):
 | |
|         """Extract metadata from a png file using PIL/Pillow."""
 | |
|         warnings.simplefilter('error', Image.DecompressionBombWarning)
 | |
|         try:
 | |
|             with Image.open(self.src_path) as img:
 | |
|                 for tag in sorted(img.info.keys()):
 | |
|                     # These are long and obnoxious/binary
 | |
|                     if tag not in ('icc_profile'):
 | |
|                         with open(metadata_file_path, 'w+') as metadata_file:
 | |
|                             metadata_file.write("Key: {}\tValue: {}\n".format(tag, img.info[tag]))
 | |
|                 # LOG: handle metadata
 | |
|                 self.set_property('metadata', 'png')
 | |
|         except Exception as e:  # Catch decompression bombs
 | |
|             # TODO: only catch DecompressionBombWarnings here?
 | |
|             self.add_error(e, "Caught exception processing metadata for {}".format(self.src_path))
 | |
|             self.make_dangerous('exception processing metadata')
 | |
|             return False
 | |
| 
 | |
|     def extract_metadata(self):
 | |
|         """Create metadata file and call correct metadata extraction method."""
 | |
|         metadata_file_path = self.create_metadata_file(".metadata.txt")
 | |
|         mt = self.mimetype
 | |
|         metadata_processing_method = self.metadata_mimetype_methods.get(mt)
 | |
|         if metadata_processing_method:
 | |
|             # TODO: should we return metadata and write it here instead of in processing method?
 | |
|             metadata_processing_method(metadata_file_path)
 | |
| 
 | |
|     #######################
 | |
|     # ##### Media - audio and video aren't converted ######
 | |
|     def audio(self):
 | |
|         """Process an audio file."""
 | |
|         self.add_description('Audio file')
 | |
|         self._media_processing()
 | |
| 
 | |
|     def video(self):
 | |
|         """Process a video."""
 | |
|         self.add_description('Video file')
 | |
|         self._media_processing()
 | |
| 
 | |
|     def _media_processing(self):
 | |
|         """Generic way to process all media files."""
 | |
|         self.add_description('Media file')
 | |
| 
 | |
|     def image(self):
 | |
|         """
 | |
|         Process an image.
 | |
| 
 | |
|         Extracts metadata to dest key using self.extract_metada() if metadata
 | |
|         is present. Creates a temporary directory on dest key, opens the image
 | |
|         using PIL.Image, saves it to the temporary directory, and copies it to
 | |
|         the destination.
 | |
|         """
 | |
|         if self.has_metadata:
 | |
|             self.extract_metadata()
 | |
|         tempdir_path = self.make_tempdir()
 | |
|         tempfile_path = os.path.join(tempdir_path, self.filename)
 | |
|         warnings.simplefilter('error', Image.DecompressionBombWarning)
 | |
|         try:  # Do image conversions
 | |
|             with Image.open(self.src_path) as img_in:
 | |
|                 with Image.frombytes(img_in.mode, img_in.size, img_in.tobytes()) as img_out:
 | |
|                     img_out.save(tempfile_path)
 | |
|                 self.src_path = tempfile_path
 | |
|         except Exception as e:  # Catch decompression bombs
 | |
|             # TODO: change this from all Exceptions to specific DecompressionBombWarning
 | |
|             self.add_error(e, "Caught exception (possible decompression bomb?) while translating file {}.".format(self.src_path))
 | |
|             self.make_dangerous('Image file containing decompression bomb')
 | |
|         if not self.is_dangerous:
 | |
|             self.add_description('Image file')
 | |
| 
 | |
| 
 | |
class GroomerLogger(object):
    """Groomer logging interface.

    Appends a tree-style text log of the directories and files it is told
    about to ``circlean_log.txt`` inside a ``logs`` directory under the
    destination root.
    """

    def __init__(self, src_root_path, dst_root_path, debug=False):
        """Create a fresh log directory and write the root entry to the log.

        :param src_root_path: root directory of the source tree
        :param dst_root_path: root directory of the destination tree; the
            ``logs`` directory is (re)created inside it
        :param debug: if true, ``log_debug_err``/``log_debug_out`` point at
            files in the log directory, otherwise at ``os.devnull``
        """
        self._src_root_path = src_root_path
        self._dst_root_path = dst_root_path
        self._log_dir_path = self._make_log_dir(dst_root_path)
        self.log_path = os.path.join(self._log_dir_path, 'circlean_log.txt')
        self._add_root_dir(src_root_path)
        if debug:
            self.log_debug_err = os.path.join(self._log_dir_path, 'debug_stderr.log')
            self.log_debug_out = os.path.join(self._log_dir_path, 'debug_stdout.log')
        else:
            self.log_debug_err = os.devnull
            self.log_debug_out = os.devnull

    def _make_log_dir(self, root_dir_path):
        """Create the directory in the dest dir that will hold the logs.

        An existing ``logs`` directory is deleted first, so every run starts
        with an empty one. Returns the path of the created directory.
        """
        log_dir_path = os.path.join(root_dir_path, 'logs')
        if os.path.exists(log_dir_path):
            shutil.rmtree(log_dir_path)
        os.makedirs(log_dir_path)
        return log_dir_path

    def _add_root_dir(self, root_path):
        """Add the root directory to the log (unindented, with a trailing '/')."""
        dirname = os.path.split(root_path)[1] + '/'
        with open(self.log_path, mode='ab') as lf:
            lf.write(bytes(dirname, 'utf-8'))
            lf.write(b'\n')

    def add_file(self, file_path, file_props, in_tempdir=False):
        """Add a file to the log. Takes a path and a dict of file properties.

        :param file_path: path of the file; used for the short hash and to
            work out the indentation depth
        :param file_props: dict providing 'is_symlink', 'errors' and either
            'symlink_path' (for symlinks) or 'is_dangerous', 'file_size',
            'copied', 'filename', 'maintype', 'subtype' and
            'description_string'
        :param in_tempdir: if true, the line is indented one level less
        :raises ValueError: if `file_path` is under neither root directory
        """
        depth = self._get_path_depth(file_path)
        try:
            # Only the first six characters of the hash are logged.
            file_hash = Logging.computehash(file_path)[:6]
        except IsADirectoryError:
            file_hash = 'directory'
        except FileNotFoundError:
            file_hash = '------'
        if file_props['is_symlink']:
            symlink_template = "+- NOT COPIED: symbolic link to {name} ({sha_hash})"
            log_string = symlink_template.format(
                name=file_props['symlink_path'],
                sha_hash=file_hash
            )
        else:
            category = "Dangerous" if file_props['is_dangerous'] else "Normal"
            size_string = self._format_file_size(file_props['file_size'])
            copied_string = '' if file_props['copied'] else 'NOT COPIED: '
            file_template = "+- {copied}{name} ({sha_hash}): {size}, type: {mt}/{st}. {cat}: {desc_str}"
            log_string = file_template.format(
                copied=copied_string,
                name=file_props['filename'],
                sha_hash=file_hash,
                size=size_string,
                mt=file_props['maintype'],
                st=file_props['subtype'],
                cat=category,
                desc_str=file_props['description_string'],
            )
        if file_props['errors']:
            error_string = ', '.join([str(key) for key in file_props['errors']])
            log_string += (' Errors: ' + error_string)
        if in_tempdir:
            depth -= 1
        self._write_line_to_log(log_string, depth)

    def add_dir(self, dir_path):
        """Add a directory to the log.

        :raises ValueError: if `dir_path` is under neither root directory
        """
        path_depth = self._get_path_depth(dir_path)
        dirname = os.path.split(dir_path)[1] + '/'
        log_line = '+- ' + dirname
        self._write_line_to_log(log_line, path_depth)

    def _format_file_size(self, size):
        """Return a string with the file size and appropriate unit.

        `size` is a number of bytes. The value is truncated to an integer in
        the largest unit (B, KB, MB, GB) for which it is below 1024; anything
        of 1024GB or more is reported in TB.
        """
        file_size = size
        for unit in ('B', 'KB', 'MB', 'GB'):
            if file_size < 1024:
                return str(int(file_size)) + unit
            file_size = file_size / 1024
        # The value has now been divided by 1024 four times, so it is in TB.
        return str(int(file_size)) + 'TB'

    def _get_path_depth(self, path):
        """Return the relative path depth compared to the root directory.

        The destination root is tried before the source root. The depth is
        the number of path separators in `path` relative to that root, so an
        entry directly inside the root has depth 0.

        :raises ValueError: if `path` contains neither root directory
        """
        if self._dst_root_path in path:
            base_path = self._dst_root_path
        elif self._src_root_path in path:
            base_path = self._src_root_path
        else:
            raise ValueError(
                'Path {!r} is neither under the destination root {!r} '
                'nor under the source root {!r}'.format(
                    path, self._dst_root_path, self._src_root_path)
            )
        relpath = os.path.relpath(path, base_path)
        return relpath.count(os.path.sep)

    def _write_line_to_log(self, line, indentation_depth):
        """
        Write a line to the log

        Pad the line according to the `indentation_depth`. A depth of zero
        or less yields only the fixed three-space padding.
        """
        padding = b'   '
        padding += b'|  ' * indentation_depth
        line_bytes = os.fsencode(line)
        with open(self.log_path, mode='ab') as lf:
            lf.write(padding)
            lf.write(line_bytes)
            lf.write(b'\n')
 | |
| 
 | |
| 
 | |
class KittenGroomerFileCheck(KittenGroomerBase):
    """Groomer that checks every file of a source tree and logs the outcome.

    Archives are unpacked with 7zip and their contents processed
    recursively, up to `max_recursive_depth`.
    """

    def __init__(self, root_src, root_dst, max_recursive_depth=2, debug=False):
        """Set up the groomer and its logger.

        :param root_src: source root directory
        :param root_dst: destination root directory
        :param max_recursive_depth: archive nesting level at which an
            archive is treated as an archive bomb
        :param debug: passed on to `GroomerLogger`
        """
        super(KittenGroomerFileCheck, self).__init__(root_src, root_dst)
        self.recursive_archive_depth = 0
        self.max_recursive_depth = max_recursive_depth
        self.logger = GroomerLogger(root_src, root_dst, debug)

    def __repr__(self):
        return "filecheck.KittenGroomerFileCheck object: {{{}}}".format(
            os.path.basename(self.src_root_path)
        )

    def process_dir(self, src_dir, dst_dir):
        """Process a directory on the source key."""
        for srcpath in self.list_files_dirs(src_dir):
            if not os.path.islink(srcpath) and os.path.isdir(srcpath):
                self.logger.add_dir(srcpath)
            else:
                # NOTE(review): only the basename is kept, so files from
                # nested directories all map directly into `dst_dir` and
                # equal names in different directories share one destination
                # path. Confirm this flattening is intended.
                dstpath = os.path.join(dst_dir, os.path.basename(srcpath))
                cur_file = File(srcpath, dstpath)
                self.process_file(cur_file)

    def process_file(self, file):
        """
        Process an individual file.

        Check the file, handle archives using self.process_archive, copy
        the file to the destination key, and clean up temporary directory.
        """
        file.check()
        if file.is_archive:
            self.process_archive(file)
        else:
            if file.should_copy:
                file.safe_copy()
                file.set_property('copied', True)
                if not file._validate_random_hashes():
                    # The copy does not match what was checked: discard it.
                    file.make_dangerous('The copied file is different from the one checked, removing.')
                    os.remove(file.dst_path)
                    # The copy no longer exists, so the log must not
                    # report the file as copied.
                    file.set_property('copied', False)
            self.write_file_to_log(file)
        # TODO: Can probably handle cleaning up the tempdir better
        if hasattr(file, 'tempdir_path'):
            self.safe_rmtree(file.tempdir_path)

    def process_archive(self, file):
        """
        Unpack an archive using 7zip and process contents using process_dir.

        Should be given a Kittengroomer file object whose src_path points
        to an archive. Once the nesting level reaches
        `max_recursive_depth` the archive is marked as dangerous
        ('Archive bomb') instead of being unpacked.
        """
        self.recursive_archive_depth += 1
        if self.recursive_archive_depth >= self.max_recursive_depth:
            file.make_dangerous('Archive bomb')
        else:
            tempdir_path = file.make_tempdir()
            # -p1=password, x=extract, -o=output location, -bd=no % indicator, -aoa=overwrite existing files
            # The arguments are passed as a list: the paths come from
            # untrusted media and must never be re-parsed as part of a
            # command string, where a quote or backslash in a file name
            # would change how the command is split.
            unpack_command = shlex.split(SEVENZ_PATH) + [
                '-p1',
                'x',
                file.src_path,
                '-o' + tempdir_path,
                '-bd',
                '-aoa',
            ]
            self._run_process(unpack_command)
            self.write_file_to_log(file)
            self.process_dir(tempdir_path, file.dst_path)
            self.safe_rmtree(tempdir_path)
        self.recursive_archive_depth -= 1

    def _run_process(self, command_string, timeout=None):
        """Run a command in a subprocess, wait until it finishes.

        :param command_string: either a command line string, which is split
            with `shlex.split`, or an already split sequence of arguments
        :param timeout: optional timeout passed to `subprocess.check_call`
        :return: True if the command exited with status 0, None if it
            failed or timed out
        """
        if isinstance(command_string, str):
            args = shlex.split(command_string)
        else:
            args = list(command_string)
        with open(self.logger.log_debug_err, 'ab') as stderr, open(self.logger.log_debug_out, 'ab') as stdout:
            try:
                subprocess.check_call(args, stdout=stdout, stderr=stderr, timeout=timeout)
            except (subprocess.TimeoutExpired, subprocess.CalledProcessError):
                return
        return True

    def write_file_to_log(self, file):
        """Pass information about `file` to self.logger.

        Archives are not written to the log by this method.
        """
        props = file.get_all_props()
        if not file.is_archive:
            # FIXME: in_tempdir is a hack to make image files appear at the correct tree depth in log
            # `tempdir_path` is looked up defensively, matching the
            # hasattr() check in process_file.
            tempdir_path = getattr(file, 'tempdir_path', None)
            in_tempdir = tempdir_path is not None and os.path.exists(tempdir_path)
            self.logger.add_file(file.src_path, props, in_tempdir)

    def list_files_dirs(self, root_dir_path):
        """
        Return a list of all files and directories below `root_dir_path`.

        Performs a depth-first traversal of the file tree, visiting the
        entries of each directory in case-insensitive name order. Symlinks
        are listed but never followed; entries that are neither a symlink,
        a directory nor a regular file are left out.
        """
        queue = []
        for path in sorted(os.listdir(root_dir_path), key=str.lower):
            full_path = os.path.join(root_dir_path, path)
            # check for symlinks first to prevent getting trapped in infinite symlink recursion
            if os.path.islink(full_path):
                queue.append(full_path)
            elif os.path.isdir(full_path):
                queue.append(full_path)
                queue += self.list_files_dirs(full_path)
            elif os.path.isfile(full_path):
                queue.append(full_path)
        return queue

    def run(self):
        """Process the whole source tree into the destination tree."""
        self.process_dir(self.src_root_path, self.dst_root_path)
 | |
| 
 | |
| 
 | |
def main(kg_implementation, description):
    """Parse the command line, then build and run a groomer.

    `kg_implementation` is called with the parsed source and destination
    directories, and `run()` is invoked on the object it returns.
    `description` is shown in the command line help.
    """
    cli = argparse.ArgumentParser(prog='KittenGroomer', description=description)
    option_specs = (
        ('-s', '--source', 'Source directory'),
        ('-d', '--destination', 'Destination directory'),
    )
    for short_flag, long_flag, help_text in option_specs:
        cli.add_argument(short_flag, long_flag, type=str, help=help_text)
    options = cli.parse_args()
    groomer = kg_implementation(options.source, options.destination)
    groomer.run()
 | |
| 
 | |
| 
 | |
# Script entry point: run the file-check groomer from the command line.
if __name__ == '__main__':
    main(
        kg_implementation=KittenGroomerFileCheck,
        description='File sanitizer used in CIRCLean. Renames potentially dangerous files.',
    )
 |