diff --git a/.travis.yml b/.travis.yml index 54a7025..6ca0532 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,8 +5,8 @@ python: - 3.4 - 3.5 - 3.6 - - "3.6-dev" - - nightly + # - "3.6-dev" + # - nightly sudo: required # https://docs.travis-ci.com/user/ci-environment/#Virtualization-environments @@ -31,8 +31,7 @@ install: - wget https://didierstevens.com/files/software/pdfid_v0_2_1.zip - unzip pdfid_v0_2_1.zip - pip install -U pip - - pip install lxml exifread pillow olefile - - pip install git+https://github.com/decalage2/oletools.git + - pip install lxml exifread pillow olefile oletools - pip install git+https://github.com/grierforensics/officedissector.git # PyCIRCLean dependencies - pip install -r dev-requirements.txt @@ -45,7 +44,8 @@ install: - pushd theZoo/malwares/Binaries - python unpackall.py - popd - - mv theZoo/malwares/Binaries/out tests/uncategorized/ + - mkdir tests/uncategorized/the_zoo/ + - mv theZoo/malwares/Binaries/out tests/uncategorized/the_zoo/ # Path traversal attacks - git clone https://github.com/jwilk/path-traversal-samples - pushd path-traversal-samples @@ -56,23 +56,30 @@ install: - make - popd - popd - - mv path-traversal-samples/zip/*.zip tests/uncategorized/ - - mv path-traversal-samples/rar/*.rar tests/uncategorized/ + - mkdir tests/uncategorized/path_traversal_zip/ + - mkdir tests/uncategorized/path_traversal_rar/ + - mv path-traversal-samples/zip/*.zip tests/uncategorized/path_traversal_zip + - mv path-traversal-samples/rar/*.rar tests/uncategorized/path_traversal_rar # Office docs - git clone https://github.com/eea/odfpy.git - - mv odfpy/tests/examples/* tests/uncategorized/ - - pushd tests/uncategorized/ + - mkdir tests/uncategorized/odfpy/ + - mv odfpy/tests/examples/* tests/uncategorized/odfpy/ + - mkdir tests/uncategorized/olefile + - pushd tests/uncategorized/olefile - wget https://bitbucket.org/decalage/olefileio_pl/raw/3073963b640935134ed0da34906fea8e506460be/Tests/images/test-ole-file.doc + - popd + - mkdir tests/uncategorized/fraunhofer && pushd tests/uncategorized/fraunhofer - wget --no-check-certificate https://www.officedissector.com/corpus/fraunhoferlibrary.zip - unzip -o fraunhoferlibrary.zip - rm fraunhoferlibrary.zip - popd + # Turned off unzipping 42.zip because it isn't included in the file catalog and archivebomb.zip ends up testing the same thing # - pushd tests/dangerous/ # - 7z x -p42 42.zip # - popd script: - - travis_wait py.test --cov=kittengroomer --cov=bin tests/ + - travis_wait py.test --cov=kittengroomer --cov=filecheck tests/ notifications: email: diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a4c16e..ba57c0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,17 +1,25 @@ Changelog ========= -2.2.0 (in progress) +2.2.0 --- New features: - Filecheck.py configuration information is now conveniently held in a Config object instead of in globals - New easier to read text-based logger (removed twiggy dependency) - Various filetypes in filecheck.py now have improved descriptions for log -- Improved the interface for adding file descriptions to files +- Improved the PyCIRCLean API interface for adding file descriptions to files +- New integration test harness using a sample file catalog Fixes: -- +- Switched back to released version of oletools +- Use set of malicious extensions from Chrome +- Check for XML Forms Architectures in PDFs +- Symlinks were being followed +- Prevent copying MacOS hidden files +- Fixes for several filetypes that were incorrectly being identified as dangerous +- Fix support for .rar archives +- Turn off executable bit on copied files 2.1.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e514393..4528d09 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -10,6 +10,12 @@ This project is in active development, so any contributions are welcome! Setting up a dev environment ============================ +* PyCIRCLean requires a working Python 3.3+ install. Before beginning install, it is recommended +to set up a virtualenv to contain Python dependencies. If you don't have experience managing Python virtualenvs, +[pyenv](https://github.com/pyenv/pyenv) and [pyenv-virtualenv](https://github.com/pyenv/pyenv-virtualenv) are great +tools. If you're running MacOS or Windows and would like to contribute to filecheck.py, you will need access to a VM using +either a cloud service or something like Virtualbox. + * First, you'll want to get a local copy of PyCIRCLean. If you'd like to make a pull request with your changes at some point, you should fork the project on github, and then `git clone` your fork. @@ -18,22 +24,17 @@ your fork. you can use `pip install dev-requirements.txt` to ensure you download any testing dependencies as well. We recommend that you use a virtualenv when installing dependencies. Note: python-magic has a non-Python dependency, libmagic. It is typically included in Linux distributions, but you might have to install -it with homebrew (`brew install libmagic`) on macOS. +it with homebrew (`brew install libmagic`) on MacOS. -* Some of the example scripts have additional dependencies for handling various filetypes. You'll have to -install these seperately if you want to try out the examples or modify them for your own purposes. -Please open an issue if you have suggestions of good alternatives for the libraries we use for file handling -or if you have an example you'd like to contribute. +* To install the dependencies for filecheck.py on Linux, you can run `make install` or view the [Makefile](./Makefile) and +install the dependencies manually. Note that `pip install lxml` can only be run after `apt-get libxml2-dev`. Running the tests ================= -* Running the tests is fairly straightforward. * First, make sure you've installed the project and testing dependencies. * Then, run `python -m pytest` or just `pytest` in the top level directory of the module. -* Each integration test that runs will generate a timestamped copy of the log for that run -in the tests/testlogs directory. * If you'd like to get information about code coverage, run the tests using `pytest --cov=kittengroomer`. * You can test with multiple versions of Python if you have them installed diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..f209ee7 --- /dev/null +++ b/Makefile @@ -0,0 +1,8 @@ +dev-install: + sudo apt-get update + sudo apt-get -y p7zip-full p7zip-rar libxml2-dev libxslt1-dev + pip install -r dev-requirements.txt + pip install lxml exifread pillow olefile oletools + pip install git+https://github.com/grierforensics/officedissector.git + wget https://didierstevens.com/files/software/pdfid_v0_2_1.zip + unzip pdfid_v0_2_1.zip diff --git a/README.md b/README.md index 0f368ab..5752c0d 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,9 @@ PyCIRCLean is the core Python code used by [CIRCLean](https://github.com/CIRCL/Circlean/), an open-source USB key and document sanitizer created by [CIRCL](https://www.circl.lu/). This module has been separated from the device-specific scripts and can be used for dedicated security applications to sanitize documents from hostile environments -to trusted environments. PyCIRCLean is currently Python 3.3+ compatible. +to trusted environments. PyCIRCLean is currently Python 3.3+ compatible. Also, while [kittengroomer](./kittengroomer) can +run on any platform supported by python-magic/libmagic, [filecheck.py](./filecheck/filecheck.py) has some dependencies that +are Linux-only, and running the full test suite will require access to a Linux box or VM. # Installation @@ -27,11 +29,11 @@ PyCIRCLean is designed to be extended to cover specific checking and sanitization workflows in different organizations such as industrial environments or restricted/classified ICT environments. A series of practical examples utilizing PyCIRCLean can be found in the [./examples](./examples) directory. Note: for commits beyond version 2.2.0 these -examples are not guaranteed to work with the PyCIRCLean API. Please check [helpers.py](./kittengroomer/helpers.py) or -[filecheck.py](./bin/filecheck.py) to see the new API interface. +examples are out of date and not guaranteed to work with the PyCIRCLean API. Please check [helpers.py](./kittengroomer/ +helpers.py) or [filecheck.py](./filecheck/filecheck.py) to see the new API interface. -The following simple example using PyCIRCLean will only copy files with a .conf extension matching the 'text/plain' MIME -type. If any other file is found in the source directory, the files won't be copied to the destination directory. +The following simple example using PyCIRCLean will only copy files with a .conf extension matching the 'text/plain' +mimetype. If any other file is found in the source directory, the files won't be copied to the destination directory. ~~~python #!/usr/bin/env python @@ -53,8 +55,6 @@ class FileSpec(FileBase): """Init file object, set the extension.""" super(FileSpec, self).__init__(src_path, dst_path) self.valid_files = {} - a, self.extension = os.path.splitext(self.src_path) - self.mimetype = magic.from_file(self.src_path, mime=True).decode("utf-8") # The initial version will only accept the file extensions/mimetypes listed here. self.valid_files.update(Config.configfiles) @@ -69,18 +69,10 @@ class FileSpec(FileBase): # Unexpected mimetype => disallowed valid = False compare_mime = 'Mime: {} - Expected: {}'.format(self.cur_file.mimetype, expected_mime) - self.add_log_details('valid', valid) - if valid: - self.cur_file.log_string = 'Extension: {} - MimeType: {}'.format(self.cur_file.extension, self.cur_file.mimetype) else: self.should_copy = False - if compare_ext is not None: - self.add_log_string(compare_ext) - else: - self.add_log_string(compare_mime) if self.should_copy: self.safe_copy() - self.write_log() class KittenGroomerSpec(KittenGroomerBase): @@ -97,7 +89,7 @@ class KittenGroomerSpec(KittenGroomerBase): """Main function doing the processing.""" to_copy = [] error = [] - for srcpath in self._list_all_files(self.src_root_dir): + for srcpath in self.list_all_files(self.src_root_dir): dstpath = srcpath.replace(self.src_root_dir, self.dst_root_dir) cur_file = FileSpec(srcpath, dstpath) cur_file.check() @@ -110,7 +102,7 @@ if __name__ == '__main__': # How to contribute -We welcome contributions (including bug fixes, new example file processing +We welcome contributions (including bug fixes and new example file processing workflows) via pull requests. We are particularly interested in any new workflows that can be used to improve security in different organizations. If you see any potential enhancements required to support your sanitization workflow, please feel diff --git a/dev-requirements.txt b/dev-requirements.txt index aa0fdd9..b01f1d1 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,3 +2,4 @@ python-magic pytest pytest-cov PyYAML +tox diff --git a/bin/README.md b/filecheck/README.md similarity index 100% rename from bin/README.md rename to filecheck/README.md diff --git a/bin/__init__.py b/filecheck/__init__.py similarity index 100% rename from bin/__init__.py rename to filecheck/__init__.py diff --git a/bin/filecheck.py b/filecheck/filecheck.py similarity index 82% rename from bin/filecheck.py rename to filecheck/filecheck.py index 540f043..c9909b8 100644 --- a/bin/filecheck.py +++ b/filecheck/filecheck.py @@ -19,31 +19,42 @@ from pdfid import PDFiD, cPDFiD from kittengroomer import FileBase, KittenGroomerBase, Logging -SEVENZ_PATH = '/usr/bin/7z' - - class Config: - """Configuration information for Filecheck.""" - + """Configuration information for filecheck.py.""" + # MIMES # Application subtypes (mimetype: 'application/') - mimes_ooxml = ['vnd.openxmlformats-officedocument.'] - mimes_office = ['msword', 'vnd.ms-'] - mimes_libreoffice = ['vnd.oasis.opendocument'] - mimes_rtf = ['rtf', 'richtext'] - mimes_pdf = ['pdf', 'postscript'] - mimes_xml = ['xml'] - mimes_ms = ['dosexec'] - mimes_compressed = ['zip', 'rar', 'bzip2', 'lzip', 'lzma', 'lzop', - 'xz', 'compress', 'gzip', 'tar'] - mimes_data = ['octet-stream'] + mimes_ooxml = ('vnd.openxmlformats-officedocument.',) + mimes_office = ('msword', 'vnd.ms-',) + mimes_libreoffice = ('vnd.oasis.opendocument',) + mimes_rtf = ('rtf', 'richtext',) + mimes_pdf = ('pdf', 'postscript',) + mimes_xml = ('xml',) + mimes_ms = ('dosexec',) + mimes_compressed = ('zip', 'rar', 'x-rar', 'bzip2', 'lzip', 'lzma', 'lzop', + 'xz', 'compress', 'gzip', 'tar',) + mimes_data = ('octet-stream',) + mimes_audio = ('ogg',) # Image subtypes - mimes_exif = ['image/jpeg', 'image/tiff'] - mimes_png = ['image/png'] + mimes_exif = ('image/jpeg', 'image/tiff',) + mimes_png = ('image/png',) # Mimetypes with metadata - mimes_metadata = ['image/jpeg', 'image/tiff', 'image/png'] + mimes_metadata = ('image/jpeg', 'image/tiff', 'image/png',) + # Mimetype aliases + aliases = { + # Win executables + 'application/x-msdos-program': 'application/x-dosexec', + 'application/x-dosexec': 'application/x-msdos-program', + # Other apps with confusing mimetypes + 'application/rtf': 'text/rtf', + 'application/rar': 'application/x-rar', + 'application/ogg': 'audio/ogg', + 'audio/ogg': 'application/ogg' + } + + # EXTS # Commonly used malicious extensions # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/ # https://github.com/wiregit/wirecode/blob/master/components/core-settings/src/main/java/org/limewire/core/settings/FilterSettings.java @@ -98,22 +109,14 @@ class Config: ".sparseimage", ".toast", ".udif", ) - # Aliases - aliases = { - # Win executables - 'application/x-msdos-program': 'application/x-dosexec', - 'application/x-dosexec': 'application/x-msdos-program', - # Other apps with confusing mimetypes - 'application/rtf': 'text/rtf', - } - # Sometimes, mimetypes.guess_type gives unexpected results, such as for .tar.gz files: # In [12]: mimetypes.guess_type('toot.tar.gz', strict=False) # Out[12]: ('application/x-tar', 'gzip') # It works as expected if you do mimetypes.guess_type('application/gzip', strict=False) override_ext = {'.gz': 'application/gzip'} - ignored_mimes = ['inode', 'model', 'multipart', 'example'] + +SEVENZ_PATH = '/usr/bin/7z' class File(FileBase): @@ -124,10 +127,9 @@ class File(FileBase): filetype-specific processing methods. """ - def __init__(self, src_path, dst_path, logger): + def __init__(self, src_path, dst_path): super(File, self).__init__(src_path, dst_path) self.is_archive = False - self.logger = logger self.tempdir_path = self.dst_path + '_temp' subtypes_apps = ( @@ -140,6 +142,7 @@ class File(FileBase): (Config.mimes_ms, self._executables), (Config.mimes_compressed, self._archive), (Config.mimes_data, self._binary_app), + (Config.mimes_audio, self.audio) ) self.app_subtype_methods = self._make_method_dict(subtypes_apps) @@ -162,13 +165,8 @@ class File(FileBase): 'inode': self.inode, } - def _check_dangerous(self): - if not self.has_mimetype: - self.make_dangerous('File has no mimetype') - if not self.has_extension: - self.make_dangerous('File has no extension') - if self.extension in Config.malicious_exts: - self.make_dangerous('Extension identifies file as potentially dangerous') + def __repr__(self): + return "".format(self.filename) def _check_extension(self): """ @@ -179,16 +177,19 @@ class File(FileBase): mimetype based on its extension differs from the mimetype determined by libmagic, then mark the file as dangerous. """ - if self.extension in Config.override_ext: - expected_mimetype = Config.override_ext[self.extension] + if not self.has_extension: + self.make_dangerous('File has no extension') else: - expected_mimetype, encoding = mimetypes.guess_type(self.src_path, - strict=False) - if expected_mimetype in Config.aliases: - expected_mimetype = Config.aliases[expected_mimetype] - is_known_extension = self.extension in mimetypes.types_map.keys() - if is_known_extension and expected_mimetype != self.mimetype: - self.make_dangerous('Mimetype does not match expected mimetype for this extension') + if self.extension in Config.override_ext: + expected_mimetype = Config.override_ext[self.extension] + else: + expected_mimetype, encoding = mimetypes.guess_type(self.src_path, + strict=False) + if expected_mimetype in Config.aliases: + expected_mimetype = Config.aliases[expected_mimetype] + is_known_extension = self.extension in mimetypes.types_map.keys() + if is_known_extension and expected_mimetype != self.mimetype: + self.make_dangerous('Mimetype does not match expected mimetype ({}) for this extension'.format(expected_mimetype)) def _check_mimetype(self): """ @@ -197,15 +198,18 @@ class File(FileBase): Determine whether the extension that are normally associated with the mimetype include the file's actual extension. """ - if self.mimetype in Config.aliases: - mimetype = Config.aliases[self.mimetype] + if not self.has_mimetype: + self.make_dangerous('File has no mimetype') else: - mimetype = self.mimetype - expected_extensions = mimetypes.guess_all_extensions(mimetype, - strict=False) - if expected_extensions: - if self.has_extension and self.extension not in expected_extensions: - self.make_dangerous('Extension does not match expected extensions for this mimetype') + if self.mimetype in Config.aliases: + mimetype = Config.aliases[self.mimetype] + else: + mimetype = self.mimetype + expected_extensions = mimetypes.guess_all_extensions(mimetype, + strict=False) + if expected_extensions: + if self.has_extension and self.extension not in expected_extensions: + self.make_dangerous('Extension does not match expected extensions ({}) for this mimetype'.format(expected_extensions)) def _check_filename(self): """ @@ -219,7 +223,7 @@ class File(FileBase): '.Trashes', '._.Trashes', '.DS_Store', '.fseventsd', '.Spotlight-V100' ) if self.filename in macos_hidden_files: - self.add_description('MacOS hidden metadata file.') + self.add_description('MacOS metadata file, added by MacOS to USB drives and some directories') self.should_copy = False right_to_left_override = u"\u202E" if right_to_left_override in self.filename: @@ -227,34 +231,27 @@ class File(FileBase): new_filename = self.filename.replace(right_to_left_override, '') self.set_property('filename', new_filename) + def _check_malicious_exts(self): + """Check that the file's extension isn't contained in a blacklist""" + if self.extension in Config.malicious_exts: + self.make_dangerous('Extension identifies file as potentially dangerous') + def check(self): """ - Main file processing method + Main file processing method. - Delegates to various helper methods including filetype-specific checks. + First, checks for basic properties that might indicate a dangerous file. + If the file isn't dangerous, then delegates to various helper methods + for filetype-specific checks based on the file's mimetype. """ - if self.maintype in Config.ignored_mimes: - self.should_copy = False - self.mime_processing_options.get(self.maintype, self.unknown)() - else: - self._check_dangerous() - self._check_filename() - if self.has_extension: - self._check_extension() - if self.has_mimetype: - self._check_mimetype() - if not self.is_dangerous: - self.mime_processing_options.get(self.maintype, self.unknown)() + # Any of these methods can call make_dangerous(): + self._check_malicious_exts() + self._check_mimetype() + self._check_extension() + self._check_filename() # can mutate self.filename - def write_log(self): - """Pass information about the file to self.logger""" - props = self.get_all_props() - if not self.is_archive: - if os.path.exists(self.tempdir_path): - # FIXME: Hack to make images appear at the correct tree depth in log - self.logger.add_file(self.src_path, props, in_tempdir=True) - return - self.logger.add_file(self.src_path, props) + if not self.is_dangerous: + self.mime_processing_options.get(self.maintype, self.unknown)() # ##### Helper functions ##### def _make_method_dict(self, list_of_tuples): @@ -287,18 +284,22 @@ class File(FileBase): self.add_description('File is a symlink to {}'.format(symlink_path)) else: self.add_description('File is an inode (empty file)') + self.should_copy = False def unknown(self): """Main type should never be unknown.""" self.add_description('Unknown mimetype') + self.should_copy = False def example(self): """Used in examples, should never be returned by libmagic.""" self.add_description('Example file') + self.should_copy = False def multipart(self): """Used in web apps, should never be returned by libmagic""" self.add_description('Multipart file - usually found in web apps') + self.should_copy = False # ##### Treated as malicious, no reason to have it on a USB key ###### def message(self): @@ -315,12 +316,10 @@ class File(FileBase): for mt in Config.mimes_rtf: if mt in self.subtype: self.add_description('Rich Text (rtf) file') - # TODO: need a way to convert it to plain text self.force_ext('.txt') return for mt in Config.mimes_ooxml: if mt in self.subtype: - self.add_description('OOXML (openoffice) file') self._ooxml() return self.add_description('Plain text file') @@ -328,16 +327,14 @@ class File(FileBase): def application(self): """Process an application specific file according to its subtype.""" - if self.subtype in self.app_subtype_methods: - method = self.app_subtype_methods[self.subtype] - method() - # TODO: should these application methods return a value? - else: - self._unknown_app() + for subtype, method in self.app_subtype_methods.items(): + if subtype in self.subtype: # checking for partial matches + method() + return + self._unknown_app() # if none of the methods match def _executables(self): """Process an executable file.""" - # LOG: change the processing_type property to some other name or include in file_string self.make_dangerous('Executable file') def _winoffice(self): @@ -372,6 +369,7 @@ class File(FileBase): def _ooxml(self): """Process an ooxml file.""" + self.add_description('OOXML (openoffice) file') try: doc = officedissector.doc.Document(self.src_path) except Exception: @@ -388,8 +386,6 @@ class File(FileBase): self.make_dangerous('Ooxml file with embedded objects') if len(doc.features.embedded_packages) > 0: self.make_dangerous('Ooxml file with embedded packages') - if not self.is_dangerous: - self.add_description('Ooxml file') def _libreoffice(self): """Process a libreoffice file.""" @@ -411,7 +407,6 @@ class File(FileBase): """Process a PDF file.""" xmlDoc = PDFiD(self.src_path) oPDFiD = cPDFiD(xmlDoc, True) - # TODO: are there other pdf characteristics which should be dangerous? if oPDFiD.encrypt.count > 0: self.make_dangerous('Encrypted pdf') if oPDFiD.js.count > 0 or oPDFiD.javascript.count > 0: @@ -536,7 +531,6 @@ class File(FileBase): using PIL.Image, saves it to the temporary directory, and copies it to the destination. """ - # TODO: make sure this method works for png, gif, tiff if self.has_metadata: self.extract_metadata() tempdir_path = self.make_tempdir() @@ -587,32 +581,47 @@ class GroomerLogger(object): lf.write(b'\n') def add_file(self, file_path, file_props, in_tempdir=False): - """Add a file to the log. Takes a dict of file properties.""" + """Add a file to the log. Takes a path and a dict of file properties.""" depth = self._get_path_depth(file_path) - description_string = ', '.join(file_props['description_string']) - file_hash = Logging.computehash(file_path)[:6] - if file_props['is_dangerous']: - description_category = "Dangerous" + try: + file_hash = Logging.computehash(file_path)[:6] + except IsADirectoryError: + file_hash = 'directory' + except FileNotFoundError: + file_hash = '------' + if file_props['is_symlink']: + symlink_template = "+- NOT COPIED: symbolic link to {name} ({sha_hash})" + log_string = symlink_template.format( + name=file_props['symlink_path'], + sha_hash=file_hash + ) else: - description_category = "Normal" - size_string = self._format_file_size(file_props['file_size']) - file_template = "+- {name} ({sha_hash}): {size}, type: {mt}/{st}. {desc}: {desc_str}" - file_string = file_template.format( - name=file_props['filename'], - sha_hash=file_hash, - size=size_string, - mt=file_props['maintype'], - st=file_props['subtype'], - desc=description_category, - desc_str=description_string, - ) - # TODO: finish adding Errors and check that they appear properly - # if file_props['errors']: - # error_string = ', '.join([str(key) for key in file_props['errors']]) - # file_string.append(' Errors: ' + error_string) + if file_props['is_dangerous']: + category = "Dangerous" + else: + category = "Normal" + size_string = self._format_file_size(file_props['file_size']) + if not file_props['copied']: + copied_string = 'NOT COPIED: ' + else: + copied_string = '' + file_template = "+- {copied}{name} ({sha_hash}): {size}, type: {mt}/{st}. {cat}: {desc_str}" + log_string = file_template.format( + copied=copied_string, + name=file_props['filename'], + sha_hash=file_hash, + size=size_string, + mt=file_props['maintype'], + st=file_props['subtype'], + cat=category, + desc_str=file_props['description_string'], + ) + if file_props['errors']: + error_string = ', '.join([str(key) for key in file_props['errors']]) + log_string += (' Errors: ' + error_string) if in_tempdir: depth -= 1 - self._write_line_to_log(file_string, depth) + self._write_line_to_log(log_string, depth) def add_dir(self, dir_path): """Add a directory to the log""" @@ -664,6 +673,11 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.max_recursive_depth = max_recursive_depth self.logger = GroomerLogger(root_src, root_dst, debug) + def __repr__(self): + return "filecheck.KittenGroomerFileCheck object: {{{}}}".format( + os.path.basename(self.src_root_path) + ) + def process_dir(self, src_dir, dst_dir): """Process a directory on the source key.""" for srcpath in self.list_files_dirs(src_dir): @@ -671,7 +685,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.logger.add_dir(srcpath) else: dstpath = os.path.join(dst_dir, os.path.basename(srcpath)) - cur_file = File(srcpath, dstpath, self.logger) + cur_file = File(srcpath, dstpath) self.process_file(cur_file) def process_file(self, file): @@ -682,12 +696,13 @@ class KittenGroomerFileCheck(KittenGroomerBase): the file to the destionation key, and clean up temporary directory. """ file.check() - if file.should_copy: - file.safe_copy() - file.set_property('copied', True) - file.write_log() if file.is_archive: self.process_archive(file) + else: + if file.should_copy: + file.safe_copy() + file.set_property('copied', True) + self.write_file_to_log(file) # TODO: Can probably handle cleaning up the tempdir better if hasattr(file, 'tempdir_path'): self.safe_rmtree(file.tempdir_path) @@ -705,10 +720,11 @@ class KittenGroomerFileCheck(KittenGroomerBase): else: tempdir_path = file.make_tempdir() command_str = '{} -p1 x "{}" -o"{}" -bd -aoa' + # -p1=password, x=extract, -o=output location, -bd=no % indicator, -aoa=overwrite existing files unpack_command = command_str.format(SEVENZ_PATH, file.src_path, tempdir_path) self._run_process(unpack_command) - file.write_log() + self.write_file_to_log(file) self.process_dir(tempdir_path, file.dst_path) self.safe_rmtree(tempdir_path) self.recursive_archive_depth -= 1 @@ -723,6 +739,14 @@ class KittenGroomerFileCheck(KittenGroomerBase): return return True + def write_file_to_log(self, file): + """Pass information about `file` to self.logger.""" + props = file.get_all_props() + if not file.is_archive: + # FIXME: in_tempdir is a hack to make image files appear at the correct tree depth in log + in_tempdir = os.path.exists(file.tempdir_path) + self.logger.add_file(file.src_path, props, in_tempdir) + def list_files_dirs(self, root_dir_path): """ Returns a list of all files and directories diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index a31abce..847d3bc 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -12,6 +12,7 @@ import os import hashlib import shutil import argparse +import stat import magic @@ -36,7 +37,7 @@ class FileBase(object): self.is_dangerous = False self.copied = False self.symlink_path = None - self.description_string = [] # array of descriptions to be joined + self._description_string = [] # array of descriptions to be joined self._errors = {} self._user_defined = {} self.should_copy = True @@ -90,18 +91,24 @@ class FileBase(object): @property def description_string(self): - return self.__description_string + if len(self._description_string) == 0: + return 'No description' + elif len(self._description_string) == 1: + return self._description_string[0] + else: + ret_string = ', '.join(self._description_string) + return ret_string.strip(', ') @description_string.setter def description_string(self, value): if hasattr(self, 'description_string'): if isinstance(value, str): - if value not in self.__description_string: - self.__description_string.append(value) + if value not in self._description_string: + self._description_string.append(value) else: raise TypeError("Description_string can only include strings") else: - self.__description_string = value + self._description_string = value def set_property(self, prop_string, value): """ @@ -139,6 +146,7 @@ class FileBase(object): 'subtype': self.subtype, 'extension': self.extension, 'is_dangerous': self.is_dangerous, + 'is_symlink': self.is_symlink, 'symlink_path': self.symlink_path, 'copied': self.copied, 'description_string': self.description_string, @@ -173,7 +181,11 @@ class FileBase(object): self.add_description(reason_string) def safe_copy(self, src=None, dst=None): - """Copy file and create destination directories if needed.""" + """ + Copy file and create destination directories if needed. + + Sets all exec bits to '0'. + """ if src is None: src = self.src_path if dst is None: @@ -181,6 +193,10 @@ class FileBase(object): try: os.makedirs(self.dst_dir, exist_ok=True) shutil.copy(src, dst) + current_perms = self._get_file_permissions(dst) + only_exec_bits = 0o0111 + perms_no_exec = current_perms & (~only_exec_bits) + os.chmod(dst, perms_no_exec) except IOError as e: # Probably means we can't write in the dest dir self.add_error(e, '') @@ -234,16 +250,14 @@ class FileBase(object): else: try: mt = magic.from_file(file_path, mime=True) - # libmagic will always return something, even if it's just 'data' + # libmagic always returns something, even if it's just 'data' except UnicodeEncodeError as e: - # FIXME: The encoding of the file that triggers this is broken (possibly it's UTF-16 and Python expects utf8) - # Note: one of the Travis files will trigger this exception self.add_error(e, '') mt = None try: mimetype = mt.decode("utf-8") except: - # FIXME: what should the exception be here if mimetype isn't utf-8? + # FIXME: what should the exception be if mimetype isn't utf-8? mimetype = mt return mimetype @@ -262,6 +276,15 @@ class FileBase(object): size = 0 return size + def _remove_exec_bit(self, file_path): + current_perms = self._get_file_permissions(file_path) + perms_no_exec = current_perms & (~stat.S_IEXEC) + os.chmod(file_path, perms_no_exec) + + def _get_file_permissions(self, file_path): + full_mode = os.stat(file_path, follow_symlinks=False).st_mode + return stat.S_IMODE(full_mode) + class Logging(object): @@ -304,7 +327,6 @@ class KittenGroomerBase(object): def list_all_files(self, directory_path): """Generator yielding path to all of the files in a directory tree.""" for root, dirs, files in os.walk(directory_path): - # files is a list anyway so we don't get much from using a generator here for filename in files: filepath = os.path.join(root, filename) yield filepath @@ -329,7 +351,12 @@ class ImplementationRequired(KittenGroomerError): pass -def main(kg_implementation, description='Call a KittenGroomer implementation to process files present in the source directory and copy them to the destination directory.'): +def main( + kg_implementation, + description=("Call a KittenGroomer implementation to process files " + "present in the source directory and copy them to the " + "destination directory.")): + print(description) parser = argparse.ArgumentParser(prog='KittenGroomer', description=description) parser.add_argument('-s', '--source', type=str, help='Source directory') parser.add_argument('-d', '--destination', type=str, help='Destination directory') diff --git a/scripts/run_filecheck_single_file.py b/scripts/run_filecheck_single_file.py new file mode 100644 index 0000000..949e129 --- /dev/null +++ b/scripts/run_filecheck_single_file.py @@ -0,0 +1,29 @@ +import sys + +from filecheck.filecheck import File + + +PATH = 'tests/dangerous/bypass.docx' +# PATH = 'tests/normal/word_docx.docx' + + +def main(): + try: + file = File(sys.argv[1], '') + except IndexError: + file = File(PATH, '') + file.check() + print( + "Name: " + file.filename, + "Desc: " + file.description_string, + "Mime: " + file.mimetype, + "Desc list: " + repr(file._description_string), + "Size: " + str(file.size), + "Src path: " + file.src_path, + "Is dangerous: " + str(file.is_dangerous), + sep='\n' + ) + + +if __name__ == '__main__': + main() diff --git a/setup.py b/setup.py index 21b03f9..1711e45 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ from setuptools import setup setup( name='kittengroomer', - version='2.1.0', + version='2.2.0', author='Raphaël Vinot', author_email='raphael.vinot@circl.lu', maintainer='Raphaël Vinot', @@ -12,7 +12,7 @@ setup( description='Standalone CIRCLean/KittenGroomer code.', packages=['kittengroomer'], scripts=[ - 'bin/filecheck.py' + 'filecheck/filecheck.py' ], classifiers=[ 'License :: OSI Approved :: BSD License', diff --git a/tests/normal/Example.svg b/tests/dangerous/Example.svg similarity index 100% rename from tests/normal/Example.svg rename to tests/dangerous/Example.svg diff --git a/tests/file_catalog.yaml b/tests/file_catalog.yaml index 13a6ab9..7b40862 100644 --- a/tests/file_catalog.yaml +++ b/tests/file_catalog.yaml @@ -12,12 +12,8 @@ normal: Example.ogg: # Added: 27-06-2017, source: https://en.wikipedia.org/wiki/File:Example.ogg description: Ogg vorbis sound file mimetype: audio/ogg - xfail: True Example.png: # Added: 27-06-2017, source: https://en.wikipedia.org/wiki/File:Example.png mimetype: image/png - Example.svg: # Added: 27-06-2017, source: https://en.wikipedia.org/wiki/File:Example.svg - mimetype: image/svg+xml - xfail: True pdf-sample.pdf: # Added: 27-06-2017, source: http://che.org.il/wp-content/uploads/2016/12/pdf-sample.pdf mimetype: application/pdf plaintext.txt: # Added: 27-06-2017, source: hand-generated @@ -25,11 +21,13 @@ normal: rar_archive.rar: # Added: 27-06-2017, Rar archive. Source: hand-generated description: rar archive mimetype: application/x-rar - xfail: True rich_text.rtf: # Added 27-06-2017), source: hand-generated mimetype: text/rtf sample_mpeg4.mp4: # Added 28-06-2017, source: https://support.apple.com/en-us/HT201549 mimetype: video/mp4 + word_docx.docx: # Added 24-07-2017, source: hand-generated using MacOS Microsoft Word 2011 + description: normal word document + mimetype: application/vnd.openxmlformats-officedocument.wordprocessingml.document zip_archive.zip: # Added 27-06-2017, source: hand-generated mimetype: application/zip @@ -48,6 +46,9 @@ dangerous: config_file.conf: # Added 27-06-2017, source: hand-generated description: config file mimetype: text/plain + Example.svg: # Added: 27-06-2017, source: https://en.wikipedia.org/wiki/File:Example.svg + description: normal svg file, should probably be replaced by a dangerous svg + mimetype: image/svg+xml message.msg: # Added 27-06-2017, source: ???? description: message file, used by Outlook etc mimetype: message/rfc822 diff --git a/tests/logging/.keepdir b/tests/logging/.keepdir new file mode 100644 index 0000000..e69de29 diff --git a/tests/logging/dir1/dir2/test.txt b/tests/logging/dir1/dir2/test.txt new file mode 100644 index 0000000..30d74d2 --- /dev/null +++ b/tests/logging/dir1/dir2/test.txt @@ -0,0 +1 @@ +test \ No newline at end of file diff --git a/tests/logging/symlink_test b/tests/logging/symlink_test new file mode 120000 index 0000000..541cb64 --- /dev/null +++ b/tests/logging/symlink_test @@ -0,0 +1 @@ +test.txt \ No newline at end of file diff --git a/tests/logging/test.conf b/tests/logging/test.conf new file mode 100644 index 0000000..30d74d2 --- /dev/null +++ b/tests/logging/test.conf @@ -0,0 +1 @@ +test \ No newline at end of file diff --git a/tests/logging/test.txt b/tests/logging/test.txt new file mode 100644 index 0000000..30d74d2 --- /dev/null +++ b/tests/logging/test.txt @@ -0,0 +1 @@ +test \ No newline at end of file diff --git a/tests/normal/word_docx.docx b/tests/normal/word_docx.docx new file mode 100644 index 0000000..b28b11b Binary files /dev/null and b/tests/normal/word_docx.docx differ diff --git a/tests/test_filecheck.py b/tests/test_filecheck.py index 1282a0e..78c72ed 100644 --- a/tests/test_filecheck.py +++ b/tests/test_filecheck.py @@ -2,13 +2,12 @@ # -*- coding: utf-8 -*- import os -import unittest.mock as mock import pytest import yaml try: - from bin.filecheck import KittenGroomerFileCheck, File, GroomerLogger + from filecheck.filecheck import KittenGroomerFileCheck, File NODEPS = False except ImportError: NODEPS = True @@ -95,6 +94,7 @@ def get_filename(sample_file): def src_dir_path(tmpdir_factory): return tmpdir_factory.mktemp('src').strpath + @fixture(scope='module') def dest_dir_path(tmpdir_factory): return tmpdir_factory.mktemp('dest').strpath @@ -106,21 +106,18 @@ def groomer(dest_dir_path): return KittenGroomerFileCheck(dummy_src_path, dest_dir_path, debug=True) -@fixture -def mock_logger(dest_dir_path): - return mock.MagicMock(spec=GroomerLogger) - - @parametrize( argnames="sample_file", argvalues=gather_sample_files(), ids=get_filename) -def test_sample_files(mock_logger, sample_file, groomer, dest_dir_path): +def test_sample_files(sample_file, groomer, dest_dir_path): if sample_file.xfail: pytest.xfail(reason='Marked xfail in file catalog') file_dest_path = os.path.join(dest_dir_path, sample_file.filename) - file = File(sample_file.path, file_dest_path, mock_logger) + file = File(sample_file.path, file_dest_path) groomer.process_file(file) + print(file.description_string) + print(file.mimetype) assert file.is_dangerous == sample_file.exp_dangerous diff --git a/tests/test_filecheck_logging.py b/tests/test_filecheck_logging.py index ddb9066..08c6c2c 100644 --- a/tests/test_filecheck_logging.py +++ b/tests/test_filecheck_logging.py @@ -2,22 +2,30 @@ # -*- coding: utf-8 -*- import os -import datetime +from datetime import datetime import pytest +try: + from filecheck.filecheck import KittenGroomerFileCheck + NODEPS = False +except ImportError: + NODEPS = True +pytestmark = pytest.mark.skipif(NODEPS, reason="Dependencies aren't installed") + def save_logs(groomer, test_description): divider = ('=' * 10 + '{}' + '=' * 10 + '\n') - test_log_path = 'tests/test_logs/{}.log'.format(test_description) + test_log_path = 'tests/{}.log'.format(test_description) time_now = str(datetime.now().time()) + '\n' with open(test_log_path, 'wb+') as test_log: - log_header = divider.format('TEST LOG') - test_log.write(bytes(log_header, encoding='utf-8')) + test_log_header = divider.format('TEST LOG') + test_log.write(bytes(test_log_header, encoding='utf-8')) test_log.write(bytes(time_now, encoding='utf-8')) test_log.write(bytes(test_description, encoding='utf-8')) test_log.write(b'\n') - test_log.write(b'-' * 20 + b'\n') + log_header = divider.format('STD LOG') + test_log.write(bytes(log_header, encoding='utf-8')) with open(groomer.logger.log_path, 'rb') as logfile: log = logfile.read() test_log.write(log) @@ -31,3 +39,9 @@ def save_logs(groomer, test_description): with open(groomer.logger.log_debug_out, 'rb') as debug_out: out = debug_out.read() test_log.write(out) + + +def test_logging(tmpdir): + groomer = KittenGroomerFileCheck('tests/logging/', tmpdir.strpath) + groomer.run() + save_logs(groomer, "visual_logging_test") diff --git a/tests/test_kittengroomer.py b/tests/test_kittengroomer.py index 1856022..1b4dcaa 100644 --- a/tests/test_kittengroomer.py +++ b/tests/test_kittengroomer.py @@ -172,13 +172,18 @@ class TestFileBase: def test_add_new_description(self, text_file): """Adding a new description should add it to the list of description strings.""" text_file.add_description('thing') - assert text_file.get_property('description_string') == ['thing'] + assert text_file.get_property('description_string') == 'thing' def test_add_description_exists(self, text_file): """Adding a description that already exists shouldn't duplicate it.""" text_file.add_description('thing') text_file.add_description('thing') - assert text_file.get_property('description_string') == ['thing'] + assert text_file.get_property('description_string') == 'thing' + + def test_add_multiple_descriptions(self, text_file): + text_file.add_description('thing') + text_file.add_description('foo') + assert text_file.get_property('description_string') == 'thing, foo' def test_add_description_not_string(self, text_file): """Adding a description that isn't a string should raise an error.""" @@ -205,7 +210,7 @@ class TestFileBase: """Marking a file as dangerous and passing in a description should add that description to the file.""" text_file.make_dangerous('thing') - assert text_file.get_property('description_string') == ['thing'] + assert text_file.get_property('description_string') == 'thing' def test_dangerous_file_mark_dangerous(self, text_file): """Marking a dangerous file as dangerous should do nothing, and the @@ -251,6 +256,11 @@ class TestFileBase: file.safe_copy() mock_copy.assert_called_once_with(file_path, dst_path) + def test_safe_copy_removes_exec_perms(self): + """`safe_copy` should create a file that doesn't have any of the + executable bits set.""" + pass + def test_safe_copy_makedir_doesnt_exist(self): """Calling safe_copy should create intermediate directories in the path if they don't exist."""