From dc76e00554011517985990c14a019160cf6ad414 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Wed, 8 Feb 2017 12:54:40 -0500 Subject: [PATCH 01/36] Removed broken coveralls link in README --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 19eb6d3..681a6d2 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,11 @@ [![Build Status](https://travis-ci.org/CIRCL/PyCIRCLean.svg?branch=master)](https://travis-ci.org/CIRCL/PyCIRCLean) [![codecov.io](https://codecov.io/github/CIRCL/PyCIRCLean/coverage.svg?branch=master)](https://codecov.io/github/CIRCL/PyCIRCLean?branch=master) -[![Coverage Status](https://coveralls.io/repos/github/Rafiot/PyCIRCLean/badge.svg?branch=master)](https://coveralls.io/github/Rafiot/PyCIRCLean?branch=master) # PyCIRCLean PyCIRCLean is the core Python code used by [CIRCLean](https://github.com/CIRCL/Circlean/), an open-source -USB key and document sanitizer created by [CIRCL](https://www.circl.lu/). This module has been separated from the -device-specific scripts and can be used for dedicated security applications to sanitize documents from hostile environments +USB key and document sanitizer created by [CIRCL](https://www.circl.lu/). This module has been separated from the +device-specific scripts and can be used for dedicated security applications to sanitize documents from hostile environments to trusted environments. PyCIRCLean is currently Python 3.3+ only. # Installation From 3cc9aa23f25903b51df156be406f59bdcaabae50 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Wed, 8 Feb 2017 13:03:19 -0500 Subject: [PATCH 02/36] Clarify language in README --- README.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 681a6d2..f0a0cf3 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,9 @@ pip install . # How to use PyCIRCLean -PyCIRCLean is a simple Python library to handle file checking and sanitization. PyCIRCLean is designed as a simple library -that can be overloaded to cover specific checking and sanitization workflows in different organizations like industrial +PyCIRCLean is a simple Python library to handle file checking and sanitization. +PyCIRCLean is designed to be overloaded and extended to cover specific checking +and sanitization workflows in different organizations such as industrial environments or restricted/classified ICT environments. A series of practical examples utilizing PyCIRCLean can be found in the [./examples](./examples) directory. @@ -125,9 +126,12 @@ if __name__ == '__main__': # How to contribute -We welcome contributions (including bug fixes, new code workflows) via pull requests. We are interested in any new workflows -that can be used to improve security in different organizations. If you see any potential enhancements required to support -your sanitization workflow, please feel free to open an issue. Read [CONTRIBUTING.md](/CONTRIBUTING.md) for more information. +We welcome contributions (including bug fixes, new example file processing +workflows) via pull requests. We are particularly interested in any new workflows +that can be used to improve security in different organizations. If you see any +potential enhancements required to support your sanitization workflow, please feel +free to open an issue. Read [CONTRIBUTING.md](/CONTRIBUTING.md) for more +information. # License From 0be926c0eb8bc92f38a585beee830cc0614a3338 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Fri, 10 Feb 2017 16:46:02 -0500 Subject: [PATCH 03/36] Reorganize KittenGroomerBase __init__ --- kittengroomer/helpers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 4e82a73..a5f47a2 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -163,21 +163,21 @@ class KittenGroomerBase(object): """Initialized with path to source and dest directories.""" self.src_root_dir = root_src self.dst_root_dir = root_dst + self.debug = debug + self.cur_file = None + # Setup logs self.log_root_dir = os.path.join(self.dst_root_dir, 'logs') self._safe_rmtree(self.log_root_dir) self._safe_mkdir(self.log_root_dir) self.log_processing = os.path.join(self.log_root_dir, 'processing.log') self.log_content = os.path.join(self.log_root_dir, 'content.log') - self.tree(self.src_root_dir) - quick_setup(file=self.log_processing) self.log_name = log.name('files') + + self.tree(self.src_root_dir) self.resources_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data') os.environ["PATH"] += os.pathsep + self.resources_path - self.cur_file = None - - self.debug = debug if self.debug: self.log_debug_err = os.path.join(self.log_root_dir, 'debug_stderr.log') self.log_debug_out = os.path.join(self.log_root_dir, 'debug_stdout.log') From 774095d95a0c0b64585394d97503092c8e1c3732 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Fri, 10 Feb 2017 19:17:07 -0500 Subject: [PATCH 04/36] Fix list_all_files docstring --- kittengroomer/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index a5f47a2..d140ecc 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -283,7 +283,7 @@ class KittenGroomerBase(object): return False def _list_all_files(self, directory): - """Generate an iterator over all the files in a directory tree.""" + """Generator yield path to all of the files in a directory tree.""" for root, dirs, files in os.walk(directory): for filename in files: filepath = os.path.join(root, filename) From e2af701ac9ab9234119e79baa8bf0139f5da9357 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Fri, 10 Feb 2017 19:27:53 -0500 Subject: [PATCH 05/36] Remove several pieces of unused code * Remove python 2 KittenGroomerBase.tree * Remove default source and dest from KittenGroomerFileCheck * Remove unused sys import --- bin/filecheck.py | 6 +----- kittengroomer/helpers.py | 22 ---------------------- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 8440bd6..9a83b27 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -144,11 +144,7 @@ class File(FileBase): class KittenGroomerFileCheck(KittenGroomerBase): - def __init__(self, root_src=None, root_dst=None, max_recursive_depth=2, debug=False): - if root_src is None: - root_src = os.path.join(os.sep, 'media', 'src') - if root_dst is None: - root_dst = os.path.join(os.sep, 'media', 'dst') + def __init__(self, root_src, root_dst, max_recursive_depth=2, debug=False): super(KittenGroomerFileCheck, self).__init__(root_src, root_dst, debug) self.recursive_archive_depth = 0 self.max_recursive_depth = max_recursive_depth diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index d140ecc..3ccbb08 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -9,7 +9,6 @@ desired behavior. import os -import sys import hashlib import shutil import argparse @@ -198,27 +197,6 @@ class KittenGroomerBase(object): def tree(self, base_dir, padding=' '): """Writes a graphical tree to the log for a given directory.""" - if sys.version_info.major == 2: - self.__tree_py2(base_dir, padding) - else: - self.__tree_py3(base_dir, padding) - - def __tree_py2(self, base_dir, padding=' '): - with open(self.log_content, 'ab') as lf: - lf.write('#' * 80 + '\n') - lf.write('{}+- {}/\n'.format(padding, os.path.basename(os.path.abspath(base_dir)))) - padding += '| ' - files = sorted(os.listdir(base_dir)) - for f in files: - curpath = os.path.join(base_dir, f) - if os.path.islink(curpath): - lf.write('{}+-- {}\t- Symbolic link to {}\n'.format(padding, f, os.readlink(curpath))) - elif os.path.isdir(curpath): - self.tree(curpath, padding) - elif os.path.isfile(curpath): - lf.write('{}+-- {}\t- {}\n'.format(padding, f, self._computehash(curpath))) - - def __tree_py3(self, base_dir, padding=' '): with open(self.log_content, 'ab') as lf: lf.write(bytes('#' * 80 + '\n', 'UTF-8')) lf.write(bytes('{}+- {}/\n'.format(padding, os.path.basename(os.path.abspath(base_dir)).encode()), 'utf8')) From 92d1b1cd9332ab62d8d6509ccb031c54b81d76ba Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Thu, 16 Feb 2017 17:27:00 -0500 Subject: [PATCH 06/36] Refactor metadata processing code --- bin/filecheck.py | 23 ++++++++++++---------- kittengroomer/helpers.py | 38 +++++++++++++++++++------------------ tests/test_kittengroomer.py | 22 ++++++++++----------- 3 files changed, 43 insertions(+), 40 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 9a83b27..1c559f8 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -427,7 +427,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): ####################### # Metadata extractors - def _metadata_exif(self, metadata_file): + def _metadata_exif(self, metadata_file_path): img = open(self.cur_file.src_path, 'rb') tags = None @@ -457,19 +457,22 @@ class KittenGroomerFileCheck(KittenGroomerBase): printable = value else: printable = str(value) - metadata_file.write("Key: {}\tValue: {}\n".format(tag, printable)) + + with open(metadata_file_path, 'w+') as metadata_file: + metadata_file.write("Key: {}\tValue: {}\n".format(tag, printable)) self.cur_file.add_log_details('metadata', 'exif') img.close() return True - def _metadata_png(self, metadataFile): + def _metadata_png(self, metadata_file_path): warnings.simplefilter('error', Image.DecompressionBombWarning) try: img = Image.open(self.cur_file.src_path) for tag in sorted(img.info.keys()): # These are long and obnoxious/binary if tag not in ('icc_profile'): - metadataFile.write("Key: {}\tValue: {}\n".format(tag, img.info[tag])) + with open(metadata_file_path, 'w+') as metadata_file: + metadata_file.write("Key: {}\tValue: {}\n".format(tag, img.info[tag])) self.cur_file.add_log_details('metadata', 'png') img.close() # Catch decompression bombs @@ -481,12 +484,12 @@ class KittenGroomerFileCheck(KittenGroomerBase): return False def extract_metadata(self): - metadata_file = self._safe_metadata_split(".metadata.txt") - success = self.metadata_processing_options.get(self.cur_file.mimetype)(metadata_file) - metadata_file.close() - if not success: - # FIXME Delete empty metadata file - pass + metadata_file_path = self.cur_file.create_metadata_file(".metadata.txt") + # todo: write metadata to file + mime = self.cur_file.mimetype + metadata_processing_method = self.metadata_processing_options.get(mime) + if metadata_processing_method: + metadata_processing_method(metadata_file_path) ####################### # ##### Media - audio and video aren't converted ###### diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 3ccbb08..14cd065 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -154,6 +154,25 @@ class FileBase(object): self.log_details['force_ext'] = True self.dst_path += ext + def create_metadata_file(self, ext): + """Create a separate file to hold this file's metadata.""" + try: + # make sure we aren't overwriting anything + if os.path.exists(self.src_path + ext): + raise KittenGroomerError("Cannot create split metadata file for \"" + + self.dst_path + "\", type '" + + ext + "': File exists.") + else: + # TODO: Uncomment these after object relationships are fixed + # dst_dir_path, filename = os.path.split(self.dst_path) + # self._safe_mkdir(dst_dir_path) + # TODO: Check extension for leading "." + self.metadata_file_path = self.dst_path + ext + return self.metadata_file_path + except KittenGroomerError as e: + # TODO: Write to log file + return False + class KittenGroomerBase(object): """Base object responsible for copy/sanitization process.""" @@ -243,25 +262,8 @@ class KittenGroomerBase(object): print(e) return False - def _safe_metadata_split(self, ext): - """Create a separate file to hold this file's metadata.""" - # TODO: fix logic in this method - dst = self.cur_file.dst_path - try: - if os.path.exists(self.cur_file.src_path + ext): # should we check dst_path as well? - raise KittenGroomerError("Cannot create split metadata file for \"" + - self.cur_file.dst_path + "\", type '" + - ext + "': File exists.") - dst_path, filename = os.path.split(dst) - self._safe_mkdir(dst_path) - return open(dst + ext, 'w+') - except Exception as e: - # TODO: Logfile - print(e) - return False - def _list_all_files(self, directory): - """Generator yield path to all of the files in a directory tree.""" + """Generator yielding path to all of the files in a directory tree.""" for root, dirs, files in os.walk(directory): for filename in files: filepath = os.path.join(root, filename) diff --git a/tests/test_kittengroomer.py b/tests/test_kittengroomer.py index 9698a95..ac92002 100644 --- a/tests/test_kittengroomer.py +++ b/tests/test_kittengroomer.py @@ -213,6 +213,16 @@ class TestFileBase: assert generic_conf_file.log_details.get('force_ext') is None # shouldn't change a file's extension if it already is right + def test_create_metadata_file(self, temp_file): + # Try making a metadata file + metadata_file_path = temp_file.create_metadata_file('.metadata.txt') + with open(metadata_file_path, 'w+') as metadata_file: + metadata_file.write('Have some metadata!') + # Shouldn't be able to make a metadata file with no extension + assert temp_file.create_metadata_file('') is False + # if metadata file already exists + # if there is no metadata to write should this work? + class TestKittenGroomerBase: @@ -258,18 +268,6 @@ class TestKittenGroomerBase: assert simple_groomer._safe_copy() is True #check that it handles weird file path inputs - def test_safe_metadata_split(self, tmpdir): - file = tmpdir.join('test.txt') - file.write('testing') - simple_groomer = KittenGroomerBase(tmpdir.strpath, tmpdir.strpath) - simple_groomer.cur_file = FileBase(file.strpath, file.strpath) - metadata_file = simple_groomer._safe_metadata_split('metadata.log') - metadata_file.write('Have some metadata!') - metadata_file.close() - assert simple_groomer._safe_metadata_split('') is False - # if metadata file already exists - # if there is no metadata to write should this work? - def test_list_all_files(self, tmpdir): file = tmpdir.join('test.txt') file.write('testing') From 1cf8a62f46efbb56c95595fbeb0f53f2918ec774 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Mon, 20 Feb 2017 19:03:11 -0500 Subject: [PATCH 07/36] First commit with Logger object - Made logger object - Moved some logger related code from Groomer to Logger - Changed logging related tests - Filecheck tests still do not pass --- bin/filecheck.py | 1 + kittengroomer/__init__.py | 2 +- kittengroomer/helpers.py | 92 ++++++++++++++++++------------------- tests/test_kittengroomer.py | 24 ++++------ 4 files changed, 55 insertions(+), 64 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 1c559f8..2535f41 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -204,6 +204,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): def _run_process(self, command_string, timeout=None): """Run command_string in a subprocess, wait until it finishes.""" args = shlex.split(command_string) + # TODO: log_debug_err and log_debug are now broken, fix with open(self.log_debug_err, 'ab') as stderr, open(self.log_debug_out, 'ab') as stdout: try: subprocess.check_call(args, stdout=stdout, stderr=stderr, timeout=timeout) diff --git a/kittengroomer/__init__.py b/kittengroomer/__init__.py index 39aa699..262988a 100644 --- a/kittengroomer/__init__.py +++ b/kittengroomer/__init__.py @@ -1,4 +1,4 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from .helpers import FileBase, KittenGroomerBase, main +from .helpers import FileBase, KittenGroomerBase, GroomerLog, main diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 14cd065..f009101 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -14,7 +14,7 @@ import shutil import argparse import magic -from twiggy import quick_setup, log +import twiggy class KittenGroomerError(Exception): @@ -38,7 +38,7 @@ class FileBase(object): or methods relevant to a given implementation. """ - def __init__(self, src_path, dst_path): + def __init__(self, src_path, dst_path, logger=None): """Initialized with the source path and expected destination path.""" self.src_path = src_path self.dst_path = dst_path @@ -46,6 +46,7 @@ class FileBase(object): self.log_string = '' self._determine_extension() self._determine_mimetype() + self.logger = logger def _determine_extension(self): _, ext = os.path.splitext(self.src_path) @@ -174,46 +175,25 @@ class FileBase(object): return False -class KittenGroomerBase(object): - """Base object responsible for copy/sanitization process.""" +class GroomerLog(object): + """Groomer logging object""" - def __init__(self, root_src, root_dst, debug=False): - """Initialized with path to source and dest directories.""" - self.src_root_dir = root_src - self.dst_root_dir = root_dst - self.debug = debug - self.cur_file = None - # Setup logs - self.log_root_dir = os.path.join(self.dst_root_dir, 'logs') - self._safe_rmtree(self.log_root_dir) - self._safe_mkdir(self.log_root_dir) - self.log_processing = os.path.join(self.log_root_dir, 'processing.log') - self.log_content = os.path.join(self.log_root_dir, 'content.log') - quick_setup(file=self.log_processing) - self.log_name = log.name('files') - - self.tree(self.src_root_dir) - self.resources_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data') - os.environ["PATH"] += os.pathsep + self.resources_path - - if self.debug: - self.log_debug_err = os.path.join(self.log_root_dir, 'debug_stderr.log') - self.log_debug_out = os.path.join(self.log_root_dir, 'debug_stdout.log') + def __init__(self, root_dir, debug=False): + self.log_dir_path = os.path.join(root_dir, 'logs') + if os.path.exists(self.log_dir_path): + shutil.rmtree(self.log_dir_path) + os.makedirs(self.log_dir_path) + self.log_processing = os.path.join(self.log_dir_path, 'processing.log') + self.log_content = os.path.join(self.log_dir_path, 'content.log') + twiggy.quick_setup(file=self.log_processing) + self.log = twiggy.log.name('files') + if debug: + self.log_debug_err = os.path.join(self.log_dir_path, 'debug_stderr.log') + self.log_debug_out = os.path.join(self.log_dir_path, 'debug_stdout.log') else: self.log_debug_err = os.devnull self.log_debug_out = os.devnull - def _computehash(self, path): - """Returns a sha256 hash of a file at a given path.""" - s = hashlib.sha256() - with open(path, 'rb') as f: - while True: - buf = f.read(0x100000) - if not buf: - break - s.update(buf) - return s.hexdigest() - def tree(self, base_dir, padding=' '): """Writes a graphical tree to the log for a given directory.""" with open(self.log_content, 'ab') as lf: @@ -230,6 +210,32 @@ class KittenGroomerBase(object): elif os.path.isfile(curpath): lf.write('{}+-- {}\t- {}\n'.format(padding, f, self._computehash(curpath)).encode(errors='ignore')) + def _computehash(self, path): + """Returns a sha256 hash of a file at a given path.""" + s = hashlib.sha256() + with open(path, 'rb') as f: + while True: + buf = f.read(0x100000) + if not buf: + break + s.update(buf) + return s.hexdigest() + + +class KittenGroomerBase(object): + """Base object responsible for copy/sanitization process.""" + + def __init__(self, root_src, root_dst, debug=False): + """Initialized with path to source and dest directories.""" + self.src_root_dir = root_src + self.dst_root_dir = root_dst + self.debug = debug + self.cur_file = None + self.logger = GroomerLog(self.dst_root_dir, debug) + # Add data/ to PATH + # self.resources_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data') + # os.environ["PATH"] += os.pathsep + self.resources_path + # ##### Helpers ##### def _safe_rmtree(self, directory): """Remove a directory tree if it exists.""" @@ -262,6 +268,7 @@ class KittenGroomerBase(object): print(e) return False + # TODO: this isn't a private method, change and edit the groomers as well def _list_all_files(self, directory): """Generator yielding path to all of the files in a directory tree.""" for root, dirs, files in os.walk(directory): @@ -269,18 +276,9 @@ class KittenGroomerBase(object): filepath = os.path.join(root, filename) yield filepath - def _print_log(self): - """ - Print log, should be called after each file. - - You probably want to reimplement it in the subclass. - """ - tmp_log = self.log_name.fields(**self.cur_file.log_details) - tmp_log.info('It did a thing.') - ####################### - def processdir(self, src_dir=None, dst_dir=None): + def processdir(self, src_dir, dst_dir): """ Implement this function in your subclass to define file processing behavior. """ diff --git a/tests/test_kittengroomer.py b/tests/test_kittengroomer.py index ac92002..bd16c5a 100644 --- a/tests/test_kittengroomer.py +++ b/tests/test_kittengroomer.py @@ -5,7 +5,7 @@ import os import pytest -from kittengroomer import FileBase, KittenGroomerBase +from kittengroomer import FileBase, KittenGroomerBase, GroomerLog from kittengroomer.helpers import ImplementationRequired skip = pytest.mark.skip @@ -224,6 +224,12 @@ class TestFileBase: # if there is no metadata to write should this work? +class TestLogger: + @xfail + def test_tree(self, tmpdir): + GroomerLog.tree(tmpdir) + + class TestKittenGroomerBase: @fixture @@ -248,15 +254,6 @@ class TestKittenGroomerBase: debug=True) # we should maybe protect access to self.current_file in some way? - def test_computehash(self, tmpdir): - file = tmpdir.join('test.txt') - file.write('testing') - simple_groomer = KittenGroomerBase(tmpdir.strpath, tmpdir.strpath) - simple_groomer._computehash(file.strpath) - - def test_tree(self, generic_groomer): - generic_groomer.tree(generic_groomer.src_root_dir) - def test_safe_copy(self, tmpdir): file = tmpdir.join('test.txt') file.write('testing') @@ -278,11 +275,6 @@ class TestKittenGroomerBase: assert file.strpath in files assert testdir.strpath not in files - def test_print_log(self, generic_groomer): - with pytest.raises(AttributeError): - generic_groomer._print_log() - # Kind of a bad test, but this should be implemented by the user anyway - def test_processdir(self, generic_groomer): with pytest.raises(ImplementationRequired): - generic_groomer.processdir() + generic_groomer.processdir(None, None) From 7d62238270fa53f2a76a5515dbccc90de2de8bd0 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Mon, 20 Feb 2017 22:38:41 -0500 Subject: [PATCH 08/36] Hacks to make tests pass before fixing --- bin/filecheck.py | 14 ++++++++------ kittengroomer/helpers.py | 10 ++++++++++ tests/logging.py | 10 +++++----- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 2535f41..133ef17 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -82,8 +82,8 @@ MAL_EXTS = ( class File(FileBase): - def __init__(self, src_path, dst_path): - super(File, self).__init__(src_path, dst_path) + def __init__(self, src_path, dst_path, logger): + super(File, self).__init__(src_path, dst_path, logger) self.is_recursive = False self._check_dangerous() if self.is_dangerous(): @@ -148,6 +148,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): super(KittenGroomerFileCheck, self).__init__(root_src, root_dst, debug) self.recursive_archive_depth = 0 self.max_recursive_depth = max_recursive_depth + self.log_name = self.logger.log subtypes_apps = [ (mimes_office, self._winoffice), @@ -193,7 +194,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): def _print_log(self): """Print the logs related to the current file being processed.""" # TODO: change name to _write_log - tmp_log = self.log_name.fields(**self.cur_file.log_details) + tmp_log = self.logger.log.fields(**self.cur_file.log_details) if self.cur_file.is_dangerous(): tmp_log.warning(self.cur_file.log_string) elif self.cur_file.log_details.get('unknown') or self.cur_file.log_details.get('binary'): @@ -205,7 +206,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): """Run command_string in a subprocess, wait until it finishes.""" args = shlex.split(command_string) # TODO: log_debug_err and log_debug are now broken, fix - with open(self.log_debug_err, 'ab') as stderr, open(self.log_debug_out, 'ab') as stdout: + with open(self.logger.log_debug_err, 'ab') as stderr, open(self.logger.log_debug_out, 'ab') as stdout: try: subprocess.check_call(args, stdout=stdout, stderr=stderr, timeout=timeout) except (subprocess.TimeoutExpired, subprocess.CalledProcessError): @@ -400,7 +401,8 @@ class KittenGroomerFileCheck(KittenGroomerBase): extract_command = '{} -p1 x "{}" -o"{}" -bd -aoa'.format(SEVENZ_PATH, self.cur_file.src_path, tmpdir) self._run_process(extract_command) self.recursive_archive_depth += 1 - self.tree(tmpdir) + # Broken so commenting out for now: + # self.tree(tmpdir) self.processdir(tmpdir, self.cur_file.dst_path) self.recursive_archive_depth -= 1 self._safe_rmtree(tmpdir) @@ -549,7 +551,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): ####################### def process_file(self, srcpath, dstpath, relative_path): - self.cur_file = File(srcpath, dstpath) + self.cur_file = File(srcpath, dstpath, self.logger) self.log_name.info('Processing {} ({}/{})', relative_path, self.cur_file.main_type, diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index f009101..1e24140 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -42,6 +42,7 @@ class FileBase(object): """Initialized with the source path and expected destination path.""" self.src_path = src_path self.dst_path = dst_path + # TODO: rename this. file_information? file_data? file_metadata? file_log? self.log_details = {'filepath': self.src_path} self.log_string = '' self._determine_extension() @@ -221,6 +222,13 @@ class GroomerLog(object): s.update(buf) return s.hexdigest() + def add_file(self): + pass + # Should this return a sublog that the file can then write to? Does that work? Too confusing to understand? + + def add_event(self, event, log_level): + pass + class KittenGroomerBase(object): """Base object responsible for copy/sanitization process.""" @@ -278,6 +286,7 @@ class KittenGroomerBase(object): ####################### + # TODO: feels like this funciton doesn't need to exist if we move main() def processdir(self, src_dir, dst_dir): """ Implement this function in your subclass to define file processing behavior. @@ -285,6 +294,7 @@ class KittenGroomerBase(object): raise ImplementationRequired('Please implement processdir.') +# TODO: Maybe this shouldn't exist? It should probably get moved to filecheck since this isn't really API code def main(kg_implementation, description='Call a KittenGroomer implementation to process files present in the source directory and copy them to the destination directory.'): parser = argparse.ArgumentParser(prog='KittenGroomer', description=description) parser.add_argument('-s', '--source', type=str, help='Source directory') diff --git a/tests/logging.py b/tests/logging.py index e625137..c937a71 100644 --- a/tests/logging.py +++ b/tests/logging.py @@ -6,17 +6,17 @@ def save_logs(groomer, test_description): test_log_path = 'tests/test_logs/{}.log'.format(test_description) with open(test_log_path, 'w+') as test_log: test_log.write(divider.format('TEST LOG')) - with open(groomer.log_processing, 'r') as logfile: + with open(groomer.logger.log_processing, 'r') as logfile: log = logfile.read() test_log.write(log) if groomer.debug: - if os.path.exists(groomer.log_debug_err): + if os.path.exists(groomer.logger.log_debug_err): test_log.write(divider.format('ERR LOG')) - with open(groomer.log_debug_err, 'r') as debug_err: + with open(groomer.logger.log_debug_err, 'r') as debug_err: err = debug_err.read() test_log.write(err) - if os.path.exists(groomer.log_debug_out): + if os.path.exists(groomer.logger.log_debug_out): test_log.write(divider.format('OUT LOG')) - with open(groomer.log_debug_out, 'r') as debug_out: + with open(groomer.logger.log_debug_out, 'r') as debug_out: out = debug_out.read() test_log.write(out) From a450fe6b96a557ef344883f3eba1ff8e2feb9556 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Wed, 22 Feb 2017 11:29:20 -0500 Subject: [PATCH 09/36] Add config object to filecheck - Grouped all configuration options for filecheck into a Config object - Makes the code easier to read since no longer many references to different configuration globals --- bin/filecheck.py | 188 +++++++++++++++++++++++------------------------ 1 file changed, 94 insertions(+), 94 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 133ef17..3b1a3a4 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -9,75 +9,75 @@ import zipfile import oletools.oleid import olefile import officedissector - import warnings import exifread from PIL import Image # from PIL import PngImagePlugin - from pdfid import PDFiD, cPDFiD from kittengroomer import FileBase, KittenGroomerBase, main + SEVENZ_PATH = '/usr/bin/7z' -# Prepare application/ -mimes_ooxml = ['vnd.openxmlformats-officedocument.'] -mimes_office = ['msword', 'vnd.ms-'] -mimes_libreoffice = ['vnd.oasis.opendocument'] -mimes_rtf = ['rtf', 'richtext'] -mimes_pdf = ['pdf', 'postscript'] -mimes_xml = ['xml'] -mimes_ms = ['dosexec'] -mimes_compressed = ['zip', 'rar', 'bzip2', 'lzip', 'lzma', 'lzop', - 'xz', 'compress', 'gzip', 'tar'] -mimes_data = ['octet-stream'] +class Config: + # Application subtypes (mimetype: 'application/') + mimes_ooxml = ['vnd.openxmlformats-officedocument.'] + mimes_office = ['msword', 'vnd.ms-'] + mimes_libreoffice = ['vnd.oasis.opendocument'] + mimes_rtf = ['rtf', 'richtext'] + mimes_pdf = ['pdf', 'postscript'] + mimes_xml = ['xml'] + mimes_ms = ['dosexec'] + mimes_compressed = ['zip', 'rar', 'bzip2', 'lzip', 'lzma', 'lzop', + 'xz', 'compress', 'gzip', 'tar'] + mimes_data = ['octet-stream'] -# Prepare image/ -mimes_exif = ['image/jpeg', 'image/tiff'] -mimes_png = ['image/png'] + # Image subtypes + mimes_exif = ['image/jpeg', 'image/tiff'] + mimes_png = ['image/png'] -# Mimetypes we can pull metadata from -mimes_metadata = ['image/jpeg', 'image/tiff', 'image/png'] + # Mimetypes with metadata + mimes_metadata = ['image/jpeg', 'image/tiff', 'image/png'] -# Aliases -aliases = { - # Win executables - 'application/x-msdos-program': 'application/x-dosexec', - 'application/x-dosexec': 'application/x-msdos-program', - # Other apps with confusing mimetypes - 'application/rtf': 'text/rtf', -} + # Commonly used malicious extensions + # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/ + # https://github.com/wiregit/wirecode/blob/master/components/core-settings/src/main/java/org/limewire/core/settings/FilterSettings.java + malicious_exts = ( + # Applications + ".exe", ".pif", ".application", ".gadget", ".msi", ".msp", ".com", ".scr", + ".hta", ".cpl", ".msc", ".jar", + # Scripts + ".bat", ".cmd", ".vb", ".vbs", ".vbe", ".js", ".jse", ".ws", ".wsf", + ".wsc", ".wsh", ".ps1", ".ps1xml", ".ps2", ".ps2xml", ".psc1", ".psc2", + ".msh", ".msh1", ".msh2", ".mshxml", ".msh1xml", ".msh2xml", + # Shortcuts + ".scf", ".lnk", ".inf", + # Other + ".reg", ".dll", + # Office macro (OOXML with macro enabled) + ".docm", ".dotm", ".xlsm", ".xltm", ".xlam", ".pptm", ".potm", ".ppam", + ".ppsm", ".sldm", + # banned from wirecode + ".asf", ".asx", ".au", ".htm", ".html", ".mht", ".vbs", + ".wax", ".wm", ".wma", ".wmd", ".wmv", ".wmx", ".wmz", ".wvx", + ) -# Sometimes, mimetypes.guess_type is giving unexpected results, such as for the .tar.gz files: -# In [12]: mimetypes.guess_type('toot.tar.gz', strict=False) -# Out[12]: ('application/x-tar', 'gzip') -# It works as expected if you do mimetypes.guess_type('application/gzip', strict=False) -propertype = {'.gz': 'application/gzip'} + # Aliases + aliases = { + # Win executables + 'application/x-msdos-program': 'application/x-dosexec', + 'application/x-dosexec': 'application/x-msdos-program', + # Other apps with confusing mimetypes + 'application/rtf': 'text/rtf', + } -# Commonly used malicious extensions -# Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/ -# https://github.com/wiregit/wirecode/blob/master/components/core-settings/src/main/java/org/limewire/core/settings/FilterSettings.java -MAL_EXTS = ( - # Applications - ".exe", ".pif", ".application", ".gadget", ".msi", ".msp", ".com", ".scr", - ".hta", ".cpl", ".msc", ".jar", - # Scripts - ".bat", ".cmd", ".vb", ".vbs", ".vbe", ".js", ".jse", ".ws", ".wsf", - ".wsc", ".wsh", ".ps1", ".ps1xml", ".ps2", ".ps2xml", ".psc1", ".psc2", - ".msh", ".msh1", ".msh2", ".mshxml", ".msh1xml", ".msh2xml", - # Shortcuts - ".scf", ".lnk", ".inf", - # Other - ".reg", ".dll", - # Office macro (OOXML with macro enabled) - ".docm", ".dotm", ".xlsm", ".xltm", ".xlam", ".pptm", ".potm", ".ppam", - ".ppsm", ".sldm", - # banned from wirecode - ".asf", ".asx", ".au", ".htm", ".html", ".mht", ".vbs", - ".wax", ".wm", ".wma", ".wmd", ".wmv", ".wmx", ".wmz", ".wvx", -) + # Sometimes, mimetypes.guess_type gives unexpected results, such as for .tar.gz files: + # In [12]: mimetypes.guess_type('toot.tar.gz', strict=False) + # Out[12]: ('application/x-tar', 'gzip') + # It works as expected if you do mimetypes.guess_type('application/gzip', strict=False) + override_ext = {'.gz': 'application/gzip'} class File(FileBase): @@ -101,7 +101,7 @@ class File(FileBase): self.make_dangerous() if not self.has_extension(): self.make_dangerous() - if self.extension in MAL_EXTS: + if self.extension in Config.malicious_exts: self.log_details.update({'malicious_extension': self.extension}) self.make_dangerous() @@ -111,12 +111,12 @@ class File(FileBase): module's list of valid mimetypes and the expected mimetype based on its extension differs from the mimetype determined by libmagic, then it marks the file as dangerous.""" - if propertype.get(self.extension) is not None: - expected_mimetype = propertype.get(self.extension) + if self.extension in Config.override_ext: + expected_mimetype = Config.override_ext[self.extension] else: expected_mimetype, encoding = mimetypes.guess_type(self.src_path, strict=False) - if aliases.get(expected_mimetype) is not None: - expected_mimetype = aliases.get(expected_mimetype) + if expected_mimetype in Config.aliases: + expected_mimetype = Config.aliases[expected_mimetype] is_known_extension = self.extension in mimetypes.types_map.keys() if is_known_extension and expected_mimetype != self.mimetype: self.log_details.update({'expected_mimetype': expected_mimetype}) @@ -126,8 +126,8 @@ class File(FileBase): """Takes the mimetype (as determined by libmagic) and determines whether the list of extensions that are normally associated with that extension contains the file's actual extension.""" - if aliases.get(self.mimetype) is not None: - mimetype = aliases.get(self.mimetype) + if self.mimetype in Config.aliases: + mimetype = Config.aliases[self.mimetype] else: mimetype = self.mimetype expected_extensions = mimetypes.guess_all_extensions(mimetype, strict=False) @@ -137,7 +137,7 @@ class File(FileBase): self.make_dangerous() def has_metadata(self): - if self.mimetype in mimes_metadata: + if self.mimetype in Config.mimes_metadata: return True return False @@ -151,23 +151,23 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.log_name = self.logger.log subtypes_apps = [ - (mimes_office, self._winoffice), - (mimes_ooxml, self._ooxml), - (mimes_rtf, self.text), - (mimes_libreoffice, self._libreoffice), - (mimes_pdf, self._pdf), - (mimes_xml, self.text), - (mimes_ms, self._executables), - (mimes_compressed, self._archive), - (mimes_data, self._binary_app), + (Config.mimes_office, self._winoffice), + (Config.mimes_ooxml, self._ooxml), + (Config.mimes_rtf, self.text), + (Config.mimes_libreoffice, self._libreoffice), + (Config.mimes_pdf, self._pdf), + (Config.mimes_xml, self.text), + (Config.mimes_ms, self._executables), + (Config.mimes_compressed, self._archive), + (Config.mimes_data, self._binary_app), ] - self.subtypes_application = self._init_subtypes_application(subtypes_apps) + self.app_subtype_methods = self._make_method_dict(subtypes_apps) types_metadata = [ - (mimes_exif, self._metadata_exif), - (mimes_png, self._metadata_png), + (Config.mimes_exif, self._metadata_exif), + (Config.mimes_png, self._metadata_png), ] - self.metadata_processing_options = self._init_subtypes_application(types_metadata) + self.metadata_mimetype_methods = self._make_method_dict(types_metadata) self.mime_processing_options = { 'text': self.text, @@ -183,17 +183,17 @@ class KittenGroomerFileCheck(KittenGroomerBase): } # ##### Helper functions ##### - def _init_subtypes_application(self, subtypes_application): - """Creates a dictionary with the right method based on the sub mime type.""" - subtype_dict = {} - for list_subtypes, func in subtypes_application: - for st in list_subtypes: - subtype_dict[st] = func - return subtype_dict + def _make_method_dict(self, list_of_tuples): + """Returns a dictionary with mimetype: method pairs.""" + dict_to_return = {} + for list_of_subtypes, method in list_of_tuples: + for subtype in list_of_subtypes: + dict_to_return[subtype] = method + return dict_to_return def _print_log(self): """Print the logs related to the current file being processed.""" - # TODO: change name to _write_log + # TODO: change name to _write_log, move to helpers tmp_log = self.logger.log.fields(**self.cur_file.log_details) if self.cur_file.is_dangerous(): tmp_log.warning(self.cur_file.log_string) @@ -205,7 +205,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): def _run_process(self, command_string, timeout=None): """Run command_string in a subprocess, wait until it finishes.""" args = shlex.split(command_string) - # TODO: log_debug_err and log_debug are now broken, fix + # TODO: log_debug_err and log_debug are now broken, fix, move to helpers with open(self.logger.log_debug_err, 'ab') as stderr, open(self.logger.log_debug_out, 'ab') as stdout: try: subprocess.check_call(args, stdout=stdout, stderr=stderr, timeout=timeout) @@ -250,15 +250,15 @@ class KittenGroomerFileCheck(KittenGroomerBase): # ##### Files that will be converted ###### def text(self): """Process an rtf, ooxml, or plaintext file.""" - for r in mimes_rtf: - if r in self.cur_file.sub_type: + for mt in Config.mimes_rtf: + if mt in self.cur_file.sub_type: self.cur_file.log_string += 'Rich Text file' # TODO: need a way to convert it to plain text self.cur_file.force_ext('.txt') self._safe_copy() return - for o in mimes_ooxml: - if o in self.cur_file.sub_type: + for mt in Config.mimes_ooxml: + if mt in self.cur_file.sub_type: self.cur_file.log_string += 'OOXML File' self._ooxml() return @@ -268,9 +268,9 @@ class KittenGroomerFileCheck(KittenGroomerBase): def application(self): """Processes an application specific file according to its subtype.""" - for subtype, fct in self.subtypes_application.items(): + for subtype, method in self.app_subtype_methods.items(): if subtype in self.cur_file.sub_type: - fct() + method() self.cur_file.log_string += 'Application file' return self.cur_file.log_string += 'Unknown Application file' @@ -401,8 +401,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): extract_command = '{} -p1 x "{}" -o"{}" -bd -aoa'.format(SEVENZ_PATH, self.cur_file.src_path, tmpdir) self._run_process(extract_command) self.recursive_archive_depth += 1 - # Broken so commenting out for now: - # self.tree(tmpdir) + self.logger.tree(tmpdir) self.processdir(tmpdir, self.cur_file.dst_path) self.recursive_archive_depth -= 1 self._safe_rmtree(tmpdir) @@ -488,10 +487,10 @@ class KittenGroomerFileCheck(KittenGroomerBase): def extract_metadata(self): metadata_file_path = self.cur_file.create_metadata_file(".metadata.txt") - # todo: write metadata to file - mime = self.cur_file.mimetype - metadata_processing_method = self.metadata_processing_options.get(mime) + mt = self.cur_file.mimetype + metadata_processing_method = self.metadata_mimetype_methods.get(mt) if metadata_processing_method: + # TODO: should we return metadata and write it here instead of in processing method? metadata_processing_method(metadata_file_path) ####################### @@ -565,6 +564,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): def processdir(self, src_dir=None, dst_dir=None): """Main function coordinating file processing.""" + # TODO: do we need defaults here? if src_dir is None: src_dir = self.src_root_dir if dst_dir is None: From 9d2f5c72412b0f6c0639ccb3c0ad7174f5983ef4 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Wed, 22 Feb 2017 16:06:51 -0500 Subject: [PATCH 10/36] Add .jpg test file, testfile_directory.md - Also fixed a bug with metadata code --- kittengroomer/helpers.py | 6 +++--- tests/src_valid/Example.jpg | Bin 0 -> 27661 bytes tests/testfile_catalog.md | 12 ++++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 tests/src_valid/Example.jpg create mode 100644 tests/testfile_catalog.md diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 1e24140..3455bbd 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -165,9 +165,9 @@ class FileBase(object): self.dst_path + "\", type '" + ext + "': File exists.") else: - # TODO: Uncomment these after object relationships are fixed - # dst_dir_path, filename = os.path.split(self.dst_path) - # self._safe_mkdir(dst_dir_path) + dst_dir_path, filename = os.path.split(self.dst_path) + if not os.path.exists(dst_dir_path): + os.makedirs(dst_dir_path) # TODO: Check extension for leading "." self.metadata_file_path = self.dst_path + ext return self.metadata_file_path diff --git a/tests/src_valid/Example.jpg b/tests/src_valid/Example.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a686d104cd5fcd90f06c6f94c2166c7aea00c11e GIT binary patch literal 27661 zcmcfobx<7L7C4FyF2OCh1lI%t1O|6z@L?bU0ts%x9Rk5gfS@x#aG3!HO>ieTf#8En zaCZ%wn^V84?|iT7y!Yzfdi8c!_aCdf_mbXgt-YlG%=}pfkbzY}ssIcO41oII1Mp`B zK%?SoZvy~mX#w~E0092q^Y8fc0C=qAZf)*qZN=(i?`g*hwX`TzF#^Bq8rkHL#UiG}e5fJu&lMUL^O8^8zv zU}F7c599yv@%M_02f)F{#=`v1T}v_m1{M}378VXJ89pIC76AYQ6AK#$mmENWN699v zpodRI{luJI5{AFX|3kw+6aWBN*ch091@-r)FgXAd69XFq0~ZGy>;F9R zmjrTbioe)#a4DaN=!H<3yCrt8e_J`mqs|-qGY260`y3`Y7CAr;Fe%N2Kq64~{r&0^qj+ZjaDn1|mrFVJV zH~?2fHcEFyH-IEqa4;_F@y9zIX!PO-DrNZ~#G=AzF~P}E*(a2>Db9JhD2}MXuYHN))igi`U34lAjL%||AvuW1Jb#Q*Km0Gs z0Md2^y{VKFhM)Ho3N-`C=SrbF(rF=%Qo1xg@H#4WKsf=E|C-?)Xf?3i{3Q>SLpQMI zi@^zpf+l{;DY6Piu{7#K&C>uw0Q@guAz1*u|1$mG@0pi@^+`0Qsadc7p7M3>)Xo9h z6>9QhUMg&0m||BP1%f=EU_F;<;xC6T>!B!iCyktx!ieEAY**S8a}*-bAT5lI|1~Z0 zUhZ?<*(quu$JrF^pry4gj9U#so|;S7-s?;)K!e*dz~c!X`94q6m4nzH(F9PkqD{kX zI)fC+(#}kDCb2XKIm($zz@h3)s%oYf9{&qP-mS(*XiVWu=>qyAwGWDF>x~iw`)#E< zuEIh?KR7xQ#EO)`(4z(HmVEAMf|jOaDU=ysIeop?>ztIi8IlqB#yd*bXm4y-Udl>A z(TZ5>D-0Gpr>5MOkvg#oI?(w?cSgN-deVt2gI-w4s5kI=_3eB@sm5zXrVefkmA8Jn z2q9#k%7G$}){To%QP*tgo0Hm#s*$ zF5^DUEzU9D`4z){<={{F>5@6?ca@xXRtbKSR%OP)K@ub zh1zzZW{t?r>&*o5nm8)#!V#hPavAhYt$eWGE#@m1A}ZRe@gnYgHUh7lkMB>?o>xjTs*;HCuPp9hOYG$Mox;pc2Tggh^^r701Ef z3DhA9OAzo!0>bG>(*Cu!dMc1G`DVwJ6!eBdwAa&sn#B?b^a}=9U9m&4RUTSKw1!QA z#r?nJWx(57d@2#tbEIyQXI%KhyuRRtc61R!no(Ds;GI`L-!6&x@)j)+RU?rHt~BMX z-5eE?a`?LM1&X1mB@pZw_u z7LL~-7-N4l$r%4M0#nJQ;L zr1>CkZ@cm355Oj->p-vF!B%rnXU{P4>XBTom~AeW#KAFJ(1%;+g4^fmkdx2&Y6;JI z6%`>F&T$aHerJKBsx&oM-S2NOlzLchHI5w*431`v$FDxZ_kKBi!rJ0DkE5;}_{c*w~TlgP3<;}Hur|rbz zxo1U2$ezwepLt8$@}aK^)s@wsdYC-*C2zHc(p3DBhEFp)MRi!jblr9#kzVk zcJ{B=rKX<0{zKgEe6!pkbW*1DT)>p{`8V{(wQF1%LUCPE)7oCOY$PkwB7;_5a zoqNTzXBEt}&33<1dscT^dBGV^kh%%`>en6s-bVyZq&&&t2rG`KU%mG~|5uJY6yZ67 z+%yovX4w*9F|!2hCp zE0i0#Q^k7SJ9Jm!2v)ssWcb!B(LXGu+O|KS=rkxB=+Px3Osll~+0miy!5pN>bLUX;hC zBZE{7Gw*QfK2pB&c;<{ozmkL$qZFun!^*ENQ*vH+K1#-YIm3b3Y_WJ9yar z`k5EIRw3P2{YF-kSUg1{9m}4fA?g+3&io#$eb>MABF|=;|6J}#2B9glmtq$AT)O2{ z$ur|k^)PK{eb^w#7CNxLM=$M6UpkFpJ=Bsy`7Y=v3gBsA={k)D zhG0PD+de4Pbneu@zLWgS;|RP6pHwoZy+b853T~3F<&bu>+OXLGVcl)E9- zdC`KNeq34HsZKwD9t$|l=~}X02UltMJ>BzIGx(Qm7#qIG_$ri&L7c7{oDFaIr0io~|FSj(oGdJN8f*e?LSu@u6p|4W08->?jxO0iG%H z2cTvce@gt!ulbuoMqSU4`5*|vM`wR#?W=}p2=7aJ@yCCh&nG@NxcEqd_etO z93HClkQ0x%soFT?MBX33v_UTT{T#4Wb@HL9B`}P;kA4cGXJljfm^+-RGUD#36OQA` zMlK9tRXkP@jCY%E^y>v)vJ+~^dQW?#c+jQxqp005<8Nvwg6mJRLmWa3@i8rCD4LjY zhQj1=Q45pUkb}N9*>R{^Tx>(&`YrJov$Z>T1G(-Y)4w7!gPq;#aVaFSO&*^Tr-%`CSkFU-`LlVh?SiEAxX=aY@ax|o1HyL|{HA9s${z0g= zwX|!1nv0(?9cnisu5O0LYNw+c(GMH_z5OL!^HpMshBP|Q!HXAvASwuOBugLf5m@b+ zv_spO)5)KxDY16BE*k7wy?N3*F?!K2E0N@|f^7kK4B#M#rhlk}OFT=#L|{ zqxnq4o&CO?cuh(Q>4uHLxM}Xzj$>EO8SdmmvE^-gj)ocIA3&OT<%JTw=fQ(7ed0G>I$*^QKI%x~3Fb|wrI^!0dnJJ&u*^ua^_-?;hFqnQ6M_|Z4U z$6I$DwI51oA2y>eL;*$rpQZ-zp4YC7T!%UQ0YHWChy1ix>PaRDF1R{tLx_t*d5JuC z?uls~dq14eu{gZk1Y%8u{lZT1S)!GzE>JP1UcTz!%({vs-QONF3?5qe#|WU;HJSTgP=K1&Uq7sR3;FPDx~+w`eRcbn zgU%-X)hMMIH9;cpads9^Nv}X8wBFQBfmcz@kNb_7(J=}-OU)OXlHVOW>$a6?ND&@4 zuKn$b$~XSy>ixJ=sc*{=Ms1vlJvN^L-mk`$wSDGA6{qIMn;HgYCl4xpbiNxV&D@d> zt`|?M33}X91qs+@nOS|1GObcNeoU&3zSiF-X(sd_bC?Qf7Xqq7vKMx% zoLUB}x;j@`?=y%Ek?n*sqZahVZoLIhC_t~nw^%s=UJ_U`U4SIu?rqd#QHJy!v2E8$ zydkbey8dA@CMS@L55g0~ksPekw}6xUe!N0M(u>CLn?sGHK)Wmz>>x-^u=~s%=*ja7 zP4wP0y{xe?j@iwdX&~Ms*A(a3uIUdO-J6V}1}jgw119S>zx`?Ua#qVXt$>4RLQalL z9?LgDuB1U;EG??HDjPwT;ySdehGwqML5-HQRWN@x3S&Aa+Z)h8ke2?~T0_&mH3n`OOWR6qO=paIUkrpt6w zwi4Lgv*M2C9}1@C3e(tR{lNBp{d>(_G`7QVx>1sKK+2O*L-w~9*pB;Zbi0YA!E>Z- z+U?SdOmo|Zd~jT5RW&WYMk2x15~ijWq9U2xy$#J!kNRpOh!`ff@@sqIBTh7Pk7e$q zW5E!`nYlSFcRPJe!|S*~!8`|W=oICN)!s0JqqVA3m}_5gxl2Q*3i}C3z_7HZoL(V9 znk9y9MxXv7!wX535Tr%l*0BkvkngjNE9F$s?2h0#V7*v)<|Qi7#hxh}p;gf1^D4c> zv=`R?%>=tJlKNphfb>Uvs#dh;Hk=j($5nEvs_0>TP+AU*Qg3b>bSO-H{mz}6Q&I`F zlf0P2^zwMlX-qB9KB4GITf{pyC0<0+>J11leem2?BoHNmP4`seyem#t$I zU$IC+=o)BC_bppI!%* zbbkQdR)MHDY1^dQr0*$=e)+zA82(Ic_wwJWjP=4TXGN&|Hw(HzU;kf@9pm2q-eFvJ z#r7PxM55~$-Y-yAO~j}#sfnXg_Ycm9A?9rNEIeBoTTO*kR%fVJamEo5h(y(pIO zx<7DoS2wekUN=cBx06^HqYFDNzX@)geks)(2sx%ZWLlfgiGBK`u!pCpuw9H$86{sL z0$Vx=WBz^hKAV2bbFz*g8d--TX5;f@WhC0do6TfSN2Xj`FHwM)-ifA+!Vqsc-a4La zVM5$0Yx%JZCbC)II)XIt2jW!PX(jv@$0uqXsB)ASc&)X#z(6hytG6}V%-0;ABR=)j z3Mq?|;MMZ@r%_jYS|TgtjqmH*k7(lk&#W$2KtGsTTMH_3YIZHF(}j4^RihFdm)jOD z*kyAyRL{kB98#GbC)kGKwyl5P^*zdfATH*p3g*lAD!he)y1liZy$|fD0GLAd47+NK z=b|0YA8XQW>s`R>^Iu=ClPsN*qzkUm^@j3D3q`54Wi_)@48bKgO{^_s>6H_Laj$}h zH>^m83X074Ql5}SaPz8)Q77mncckeEW79p=X-A(fYihoF1{)-mn3_WGP9LTXKMxKE z$oCm|jj|BfIn7CfR~eq>c1Iq48T=SUlU$1|dHEJ3!4UdY$}Kjea8a>TCybXI0cV@W z$H%}vF||VFZtxVL=e>zNG4M@vR1(km5Rwl&5aRqnyllre8^7{e5qrqIf7@I3 z^`om2h&uhL@|E98?boCanb||$qG1!RV1);DCb8d$XC(4CK|`L=zS5g?95{Ut3h^9h z+{DRl#zI**pyvFUOrsYt*kSe1RH*OlCUaj7$Br=c(i{2^ZGoc?4J^y-BK*1E<{_Zgno&^(y2|CAr#U}itxm^_2iX< z!{5!c{sq>~^^wNm=qbIw_CbuUQEW}_cWitu{~1%j|KFTDhCe!mZFEdXA7vlOj>qJt zhCu*!2VDZh!geWb-EuyU^J(Kr|8Uu;;eXf_AtB3&jkTRaGTF8Bn>25`gnlf1wxZdz ztgIaD9~r8cB}Y!H@Q|Gnq%k#Mly?Ifc61&#oLY3E)pz?>Q}<>ZFHtcwjcb;<8O`+wEKKzNhhdcD_(xUw1Yrfj$o|`5u9J8w{WT! zji-p`pysEJS5@yk!CwQN;E%mN*9)yfdq+`WV0kZid!HX+$y4|WE83qrm2C7lOXw(f z6!WK^B8~d5y>RuOkAxJb&if&N6QA5_5xjh$eqEVxrk9$o@Jz46anpgPGy-2-?Wv@O zGa!YnjpR6n%-x(nlj!(Vp(4-gD}a_o^8O4h4Yim^dK{OiZ!E8i!UTUtNG&MF`zP=5 zaMvVH6qe4?+}NBXndW`(T$;lZeuc+sn>$%fB#a?qI3X{?8L;RD=Hr(M@}Wa&tm}tkJOn!{e;6oG9Yl0k>*ry4$+2! z$y0uZdJGt1jdd^YDRl!#d*L$ne0qDL(NOf-9SoN z65v(sc}!0-703d2c@Wn;Ep3<;bf)zNZb=RaVd@}tNP_U3)H&L| z#=Q=~FR`mIy7)Lz`{U*IC-a`tNpO4_h^ffzC3Tl>k&;mev3}MJVbWZ%p<1LDBHO6x z6@CJ&eB;^YK3luRY2b1RZyK%NHA&r!40XLL-uw#kUVJaZBygH{SCJ_m)=CHF@#MdM zx4RnU)v}sQQ?B98b5&vtZ{#+bnGjouCxlz9IatE!1KxqGk%1tR@YIO|yu!c&rRs)Q zbaZ!dd=z^-j1gq->O22Lj2U6ntsrd#f5-u%I)FMes4`1sRct2H573cDL_F&osHiY zqH9Ffo@yq^?=xS&jevyE5YH;}B@}BP5*-dYC`Os;&Sx4Tp&Rwu4ZMR$upfZLB5-IV zRdY_1R5>I=jNG1J@~=*s7+H*Qwe z3m+`YAqP##*`GCg&9$sHYM%1Ek|RH$ms`0QVngxNJ4Q1u3zMk2r>dQKtgW2++DgAM zL`>=@zefHD>8*Wfz;H*6}R zd!X3;6><+26UDljxS2!*c8+WlL*1g5lTL2Cq%OU7m;M)Yn<#H1L4!o?F-n+bNTtAlc%Jg5ADg@;S@(BPl85yY((Y)L>WUdN_tPX|!7VEBB z4lDX#@=WqI3E10yf*((FFv~*fzfxs-0);i`WB|;I{^Fx}t7vdL(L5(7XxE7!op*k_^T`zb^#c?2qkVz<(TIf;c&f1*?GMv(xSKl}1lt65y|jI&Vh>>KqI+ zYulBb^ml&f2~B*u$T-o*zWgA6e%u#0NnelIh_4yODjll+ik(ogdHg@s)FZr#!lk(m z=AO*#QyN@o0s3<9xm$&SLtf8rCcky64XLVvjSDMQ^>jjR1atQ~50ljI9B5FgGbowX zy1OnvCzjauDOdFR53zt=B#)dB6y9U50Br z>N0FleWObIqDsdVXx-_ZtQy=XNCR8hu4QY;%=`Os8 z)avStmT^9EZ`URgR@YY%fyBS`KTAOTwwwATf*LhOlqg!U@om)IeBx(kqC9f|nIpzZ zw7fmP`$gJuMiv2QHIQ~p2#!l)rm+-snq5LJJSl@ z^b`bGh|J~pIH?g5Fw}W7sYy$GiGTZ;Y`fUMqUz3*-YPPh0|&v&rc`2(=SM^J^c?}Pp)vP2p|vv}qW5oocZrn^Mwgo3%*++;rc6)JH=X3%sjAJvuj5*4|Jjmy z#1s|%+b;5V9EEB~9TBn2j^&qQWHkI!r9<_82yd!i{s8%`1$d}=sI_Ntt-irBU^M^D z(Gqk#e4piC|6i!I8AF4XFSo=H;ooF&zgciswssl8$n_Y5erL4TmpK0FC@cp5vv8jARXhOnSF4janaSlc~0r(|( z-UOZUKwv`km=lt6U6ewq)l9KztKEQ+V-VaQ1#4;rB0?J|W)fhT{1zQNvV=^zwVj*m|(%np6? zk@1pOJswUPzcpsS#q>={X0Uf5-t3dYIl`wTDQhF2UmK?(J3J|uv;=@xuWW}^kuo(x za%kdO)1;F6w$BUctdWo6Xv`!}s-`VI=dRgnJ1#pBpUG=}mFQ$Vo@Or(q%CcO2Kzg+ z_Axz=&*=gNpnM5u?o|nu9OM^V<#&qv+P0sEA*i3m)z0#zEJRb+<4lp!%)KdTKYU=L zjq{Jkid*`!7t`yT=T+a;|F~t>GzT|F21hU5yOyJE1^aUzZ>+n&4()tVPthMWd(rgFxm?b+ywX%{a9aJL9j-0&Qu&CS>}q6Wz@{`|M`G;MB`8 zH^K|L`J$9XBkP)=pM%slv_W9?vRgouqF06Lx214uy;pH`%j72kn>BOrt%}**7Qwh^O#551A zP_}lG!z8sj>^NEmX|?#$C7Q58zEK4HMFdC4l5oXK`ydw`2Y%WMhE4I)8*|W!tW48U zy@P1CNp8RO#|wHtB9M9byScKp5*<64CWbbyd==bELTELO5;45ewgs$Xe%?t=#l9VdgbdW0V&k=3BfT=LgE$L&ohu$+506Cr7^jmLr4D`qp znZz-9WS)}4BPra~Y;=z3#)!c#T)#v+AC*OdA^B~H?1sIAQDlK=A!RbXuixskv<-&~ z!T|MSy!yh{B6Rl;eAn{){5Xq*ys&uE`wK3SpVk~f$KsT*j3f)Io>vsF^prCEu`(~W z>$WP|DK4Uj-GYmnre8+Il?dLHe%7(yNTDYdN0L~4#m6d)JdAr%>u&1GTjNXA_N`V& zu*wxlvbIm3*fA@9Y~S=OS_!#blD6-4NT|K|J@&Vq@p&|T<}~lp>zbm>1Rzgvfc^m< zo|^Z$#Ufjl!RVe;0Zkmm7}cwqwwyZ z+@-#ren}sNvEbELh>V2HF~~M>CF~X`YLI94uRNv$rse-Ha+kP{2w+3S4uRTQGUVzBY1cFWF7YAN9tJs&GDG-knUITax+fwt|ep*b5?uO)XAc zG>rxyAIQTUw5vX+)7lsH+7qo4=|q0wFrZP>&7W?{2G(B9Dyo-Jt)$qV18!4H6r=d{ z9~rSJvHN1Yy>%Onx0|*eG&{EnJd}WOTqVa3mKDA`rfC#R=_^}R8_X>|JTA;9iq;Vn zi7kYUf59tBQYCqhD>WQa`dCX5G|MPh!*f{-OFpr!TxVe}?%imC=0Xi~)ercls;U|3 zW)n#R^vSindL0JcW=#~6DS(~c1%S|6GXI>-kQ_T1G4Rmzj+Fmrl6RN%4jE*s@r*6c z#uY!w5FBtM>GZ4=O@apt3>~!5)Y{Z|odZ+aO7?tvu_Prjr1mmVh1Eqg`vVc*UTe;h zQ=|u)YX1n|WgvJLyqZ+k3&czkj>k;?Rq-n?;3=uYQ-AIcO}+zbQs35UZ|>vTb8l7e zGtsTD?G44h4fa0YD!K{mRl{Bv=G%sEp3-~q#K2(!{LEi_-ToGAwPcF($jLa0*e-Zb zkA}tHn=M*1V?ujmcN3Yo31Q#C5WF@v`K$L$W1os!2k-;jsnoQ(ri_jWjxQZob57q> z8~Wo_&gL3A&lH?k(*1-s4em)Cb!?Ykfc($=uN72AYBSduKVZc~))J7QTyt_5?qu{B`!YwhhM`_-aZ(_6Q@*)Z;~ot~o1%)a5S-M$BawpO@{P zovy3!)JktCO?z87y3 z!8cV)fjBRzH9DPxy=bOJx|pQxQZ^T9X=vzuAV(%Hj^;*o)nwE5mk4me_-y)75Ov6@ z%&AVb($Mr?-Oue+Vr_jRdm+yOkLJqO37uGT_XaPGBGuq-Z?@(k9&q&!g?*6B7FeSW zh1=94`L3$2L_lq$pL?l{dueTbS~2Iddo%A$QfeT*_h|dZB#OusM)ywWYp00?RK>~z zniy=b;4b>BNUOd9<#gcjg(82BkjdJtWd4m`#{u0G)AG;`;Z&J={Hr*W=a>Esbti5@ z&;j^wrbg$JGhdeEQ++;>Ny5q~I#d;5R=E0wwweQ}Ia43BmdYuI#bU$TMmp=U8j(;F z6KF)l+(4aBK^3dO-DA%C@@|f}jLS(Jvf8eZjo@U+GAW~iWiMEMw;L$xsd!?<@01ap zg}E6#CD-}t!c6^8kNs>fuBdPRiHD+64PV9~?B=IrO~EQR{mY~GeR6qJ3y(9YXqQWGYq=V4Dq*NzR4w17GwLspI=D($ah@=V6QlrEar9mrvh-_i#hzF=XEy+ ze|GssGHVr!!&!9>XZ*nGZQEeSIS54TgRYn~N36qdOWvhk#a*1WnRc%A_31{Yj~BDnGibN%x@W*m=8QRQKP~=N?%u9|38gL{uQTSNc9jZ78;K^ykmc=D9_(#|1KmaZTZe<{9&oIf+L*S z)eZ~J6W4KiX}?2<@0~fX%^~>6iW9`n8VdZ(R(b0*?_X6%2rU-Mwz9s(7?u^y*_XMA zvm3Cq`DAF)_ zJAn-R3~>iCS?GZaiZ^H_b-6i`-h=iE=46b>kN2%P1u4{ZX~8Wz4-dmjU&sq**$GhI zu~yj4koPSWlIu669Mg-iX2yCN_yS9+b22X!wf(dQ=zz={YH8)NYZ>@t#KZiD(( zy9p=b@NR)9vM~*{doO>~9l}vUovXt^1qAJbhGyOtLg7RaUU+!36V)2&5gYCcmpr)! z8Fd>IG;0;2KcgY&I`nI!0mIZJh^mfW*r{DBB`5ypaR&tv{_+uJYl&L_0eegD(*SK; zwKo`ubHa-!ViGFe9_nuJ1~{S=o2>q%oBE)xgyMv&X2ErX)Y!`2&@>7AHBYZ2Y@0q#OCrOB<*i&_S;6K`$P zy2iDLoGKaZ)QP~ypun-%@z7qH2jJjY>MykWxJ0eT6U4&h>e&00ngb_|*wxU$+{eN> z&Z4uYfsMP<@LzuD-3P}lMQX9c%>x*_9kdEa;lG}JZ=EikyX#XmzJT>j*-2;E3vu_w zqGCSrO`^UTa|XC@NEhoHzqg$>UgT)<>8_6I`L^9wRq??a%|8GYJ0 zeaY+bgWS62sDygAm^mAGo|d$c&BA;9M53Ux|7^9DkNvII;-Pinv1@>olxMU9r}0GE z!gVt9ySfv`d~oPT_B^cO;d;;C5u50Sy}1@EpD?7#ns%Og>Jywt+C{+CG8C_{d)!H<_J z5UT*pgEX9+F0;l#n zIu1rIlREYE=fMkmAw$tAMW&CWAYq}E$<3xf!+sby*G9*dRt?qp=7-KN5RFfS;J6o7 zm!$DJ^bF5=MR5=MGP(_Ty9tOhE2O|ujebXxSX-lj?hdkTj%HAs@r0CX^uZcodtT30F?2OlV&4r51iSN|CDZ-3Ct}ts64# zSmESk90)hb=;9rBx=o6I^3EGU-3Kw_0e8@{%Y>6puTA>)%$|CyFU`I;C35VFyps(+ z$`t7?uKI-Y$b9L`L%i$n^RgD}nh73ctx1M3N`X?vF9Uw8?{QNBr4vT>9y&UrTFi}$J$H9L*uR3iJLtf7!WpG!#aSDk zR>1HHJNbmnpK%R~$@8uP!fKc+M6D9M2i}*72A$wF?C|>^ua>LwCu%E}JPQCRy05C; z{5bmEw~p4P-Jtv7g;^Gro3Bs8D_a3g)vr}re;6VCBvEY&qF_sLP^l-IeMBt%JSjPI zYc_7_B;FZzTq4v9A*+OPfBoCV)abvOR3jeFCWK=fq~>C)Jr;YhS8n_~VbYy9{x$Km zsr~v-`&!oUVyYwzJoY}kdYN>M&UoUvxB6e#B%t*ohT~A3ii%UwDf?ZI^9#XxKyowt zRG)U&wP@mJ0v-M93bb`NwLrKyBq6ckeE4~VTB({?U#7<$^YqM@H3J*-=9Rm)(p^KH zc;egg;)Cwk6w4Z9157l5dIF2x^xGGmXSlOY-yZTYKZuv+>CM7SlUCQ62Ippl7-4az zK{70^<6UJ7UZVPT?|^&ZpLgG&=XB0e0-`pKhx4w$?y)bVk@1Z?Y~^rjqhc>4-3x)j z>Ntwy7J;kv<`tZbCr8Z|5Myw+RlHojlLB> z!{=npBk7;Rl@cyz>%67^Vlj%1@9VGM@Vb#xLcippJ8tD3u_x$f+2yWApXDc5uI}&& zpBDs@a^KHctW?+m8bGHWe%X%5ATE4%DSykVU);5~U*eTalspz>q6UWhqJr0MDNbKA z_D=b8yB+k#^caaJpWE^pe6Awv6KIJ!(P~u+~ri@iarPMEivLr-JOovT@-<2c{ ztD265IhwwazdsK(2^N`)y~gNaEH&Q;H$RxV2{v*kmz!n%uI%F@lP=FIReT%C!l z3B$O1%kj6>^0ysj^yIC#r^JcSoRW)*wGzcMXq~iT@UA?tG0a`5H9v&B&tm$maHAF_ z42b8VefF8qkhi*2FtVg-#$1avg0pv^5hb&w+ox9t%I_R^{%QRk&wKVx>df25KUP8C zWQ;F!^LXnk;*x(hm)kB=!j#Fw?n-mBgvU&e8H{DkJ7QK)Z~F2ePiPAMgu|XxD2o-R z1~Lor&u$oRGXcj4EAW#5{P}4|`n`5V8>%E1{K{1wleOq$B930QE3M*aZx-o*-t9YW z4}kFMu`jnzKar%TRkVUCRqRE)k=Z6aPRq;%*qCC`+mqJHsXrCTssb?M0J1RB2yH*s^b+I!IB-!^fE8n_q))1FR@Aj4dc!H**9||L#Vx!AnZm#J>g%C_wf(V}3{;g{K!A(rHg@$qU zupg46h;}9ye`;#GPNls+ntteXenPDN-!&4!j*~cw*{(l;(YFtI^t{t>8G`?>Vae;) z;G&b2MqJ+oi4vMK^=Euxwq7mP@X0IcWlbpt zNRM@yIT~;dtX-)mL)G7lMc3jc8fkGUQq|f1Fek3roTN&SanEC@_c#{ z-p#Lg{_+oifAXciivxVbfxRO;3fqG!ICiU9^122yWw~zsxsLZ=6yKkEJ6g@EX7AZJepH7cg8GZyy%tDpOz7eOvd%G zje4)Hcp1a|t;Q#H_2N2v66bmNdkyE5k5%vZ+8%OY{U6l~D050b*wk>u^Td#dZJpEc zQTGq^3~4;cEU}0|I;cY+&(wGEWm=;&OPtKFYivtoLrZ#10_v0#B1dl<05=6eW7!>l zE8@-ddPd3uy+=5$Tule;$*48=u6v>(JJ-+K(M~?ZsebBU;RjOUXONs2f237>0*iB& z_rYtEn)Qk87S5(TRh?54GLeE+oh)m$IWHA|KjoPzQ@0Y%3GGH+1MYN&* zgb%~o`!vni+zMyD?w#9!Q8fJyuj#GFIZeY)O4`d7J^|# zDwC=Xnb8gqsf+$T66K6)${*Cwy}=*09AF0&QskgJw1E3TbMzB!#A#^GxBny zr8DBbgvgZGc&zrq>LHgQPHi04K!uN!4nc_i~<9Y_NJiUN^C!$!pJgXJcc&l>5^6 zldJ*>pM;Br%8?1K@-NLhustUycYU7XSN@g=vQhfU*cQ zYTl$|@0u^^k;X3=>8o@x+=vB`Qyu$DbHTo%LhU(Z{GH^5hy-4b?`DveQ*=!|>vwm3 zUCt4R@FBV4qv*ZGRuGd^Ku{oG;kEKBR~I&$;Rv_BhM4lIboZTTA(SH^j~zQMPd={o zX8K|H&{yf$_2!$|9{|4NaF1VOU5^(+kgO?pufkSZh?6a=IQQltfGX>=46fzTv0K|ll| zy(iSrBq4+@z3N}2BM=}I2_U^=0YSxfIA`X4p7+B$bDo*^{&dgWU-qood#!t|eO*5( z!$R$@tgLYtyG!5mz2}#p3pyRX7iNw-+1Dzj2?TO(MHCjSD;KI^KdIE@d$2#1havYn z_?v@?KuN>qH8h)89Ak2ti!^k6<|;n8>Gg;9a}mn9BWScHF_otp5!7G-2u^}jR=$S^vW4Kw#7+Or&4a7%lZwDI+o zWjDu}h(|2VrtHQ)?d{HnHg2Ok7JTr7R>y^7z84Vw;;@mmu%~Ng9a9iGQ=ko#5EgO^T!Y}g!USwj#Qp?0>NWRD$Tm`>Ceee4? z2I8xZSL&BroK!sxcEqWR2o3n z+9<-H`)B~Onq(h&(nXS5edR94O%+H(KTqA1D*0_*&nEnBc-PWbWq2}YC@%S0TjPD% zZ`7$9g4#e-m7un^?^gP)+^$RU>ml?G_6(f%`L=+}leMq&9$bn(iqwzo) z@qMrJXEskg=7WaGvS*vn>u3K)DJWlQ$|tkRD^y}f9IEwUAY_DL+g;JXFUa2^c~~h5pcyxbaVwp!BN`v2}&m;)4mWNFKeIQKQxKFG2Iz-fp=WM#)}fnwF8) zwd~ACVe7?75JjO!CuE!DX8c!YdFQY7zv=lXRAX0S3%ZAk8r^?TR_|uFLo55kK|vvB zxh0`q${Xetr9UhNaX`D!V?~1`4!bWvD+>Fk6*{9&a3eySoizNMYsH2b-6{%dMoS5mZbmTGBhye2CQ++Wn5YM z{#ngVb>A=Ptd+($r(o4SnUO3CEf-A56M?@9i-e5}=4 z4PjkvTUJr#3H@Y=4)}qEbd%sOX@L(u5PaF8k=Hjb^B-3oCJ!R%>{UV!$Yj&l2b%NG z1)`4idSWUSmNE@Ds2SHwErc36eM+ zEnUc2xI}Tes-Pp`ab!Zv8|gV(`?=kKQZY%M`wV?MHZ9dI*{FDLp*z9H<2ZAYN7>TC z^||IYqdQI;d{g`|qp>aFcEVtJ*K;9zk9+*0vLpxbo+0n2O+jsvrmo&I2_3BSFCGS-o>^zVlpDPx{+R-!lHZg_{>H1zsqs$b{qaB!nNCGtG>U{ z?7JXfp)vQZ4=*%rUun&RY0=E{sb)Mg%>@C!rw7$8)#|%#G_`=-Osq}i%T1u@o2+__ zPmImsvpUbK&^yq|z+rrx=XJ(8H+Y`&=4XA1;IYnfb9kHfTiu{Xn{!DIZJWI8h1pRB zyytS1juC4LkAA{m5HER7J{_s;Jq0hnovuj!R6+9<9+U{{i&lG8oa8GMIZLvcVB|g& z^xGxhIy_!et1~U&+(xg!mr{SCJZEN2GoelL_hLRq#EJoG4%IO>xVM5I&P!uX7sqzR zgEr2|54bc9+`Dw*pm1_@o#Y!Que=?oDasC37DVJ)NwNPOC>+ zy{yh#R$CL{F4fU(hueM~^YI$OkbH?X-TN1p?z4P1EHg1XILtyuMR%|HeKC5r_vP!% zCq~|MG*`}FfPR+(t#u~=G7v|}e7%mEFIp_LDC;&N(Jsp>{a)Dib{Ln8yKznd&ER06 z3s?_g(7pc`aJAh>dYFDfvNyd?Su+R2ZSxB_HT$;Fp)|)Kw5ekx-k)6k;NfNQ+88E@siZvfC%=_)pCB5o47RYQrV}2 z)6KF~ni&?61*R?LIs(Z@Ylf{wo+R|O5LqywI<@dg6SKh|KKFU_anpN)cxVBs3r_zG zI1?Qk@&1_J}htR*$MRj&wIR3}`+#e?t_Y2lb>!1(Q z(5D;Q2;^%Hq1~dpTstU#V~|}F7Jk-a%GE}#EXziw@JG$)Z%r@XW-Yx9X+N||>9jZ5 z%i5nlC=Q?eNb7I5Xhj9?5*4$AZm7YH4amw)Fj~PFj zpHeLK{XAT%v2)BpU##e`rx5h*z_4aOzygAOF=i^)wbyGO4m2HH>6YPL@hJWw!mzou zrW#`meW6#ClwEDy4&4SG?I>^tI5a*P0XD-@e)tdtDYiE{{nJK zz(KLPm?{M-#AMSg~#vaIB3=>FIDTJi*3q=D+~fe2M;UGj%Phu z8K_N+DLd@lBv|WKF>2?O#_ZHDI%UKb(XMXbVWSDEX3o~d+TX2Z6uSJj#?e02YP4`NJXZmD^QXt$5*hlU{AK1xH?+?6WoP+<^K%_x?Emf9#z@C+bI8;_p)Fo zqE4M;@K9UB`fUOuf7j8XkCqW&`vv+;I4t2jv})C1mFHt;qpFM*%bc3l*o%(e2V%fn ze(z_y@~Jcj8q6_uTZ()enb(6imVlMAwgm zeH>b9Jjb9}FCWX5&WCelpPUQeIqOuH2cW0<)B+Q4{`b%l=*gdeyXut ztw&UcO+kYHizmY!qIqkL7L>@YN;B_(J%o`DrGK-xL*xGffDO>MQPKel9MGPImI3Rc zce_iQ<%>^^zIHY&pBEEu&+am*y}VS`2%;w1zr{&IaaQ|MGXT_gTV1--8D z;I67(59kWyJQo?8`>?vL<_J4iIbfztkKCPYHYWlhfW=sXjEj~Lvl5ZSS%I@@%lwT$ zqWsUU8^fQI)LXKgha@R4vY@bPn&HbBmoaR2zivzx! zZx)meL{0?fg`*Qz8!eR1%Z>7U&rG#1-bAs94*<}NC3j)DF#hki|$yqY8GwyvG7QHU=a z6mE1qn%iY|u@CLgvUaEQ(d-U9_ye*Zr+MF-MQ}N@K{+BuUld%D8}gdB@(9SDCKc2Z z-J4T}&6&F+|3;QwdPeu|wFiM?A3p~<1<6rvKFA3`9#q!&;?$zvU9i+Euk}yd6%Vyk zDW^fO>C66QT`8o8Y!^i6u=zwKXntaTvVr9GE_O<9{gXcy8@+0Twql=TkF&IWoDy)m zt>%Y+$#8-banQ8{SIJ*v=3|nHrAInQthh+<`Qrfr1kPY;w?~&uV^P_}<-~b0GW6FF z_Pi*mhkU+W&jC$utK-?(#ysxG>13-Gfz0X!9dJ{oF0_Je`5j6$%dsNiRDn3GhSgB~NQu@VDC?KZ3PrlR;U2pCtxb9~fj=dMB38q4d`R zfPMiCi^?hE-Ymh|5oN6+8mlU7lXXiMhmyDq_6=%GgJCI6MA=x8a4gf6XR*;lSq<%7 zhNf+fldBgEPqPAN6e}+_7H*=~q)H}&!kdALE|PXya7@fpsVPhS-h5sF4FQ4YH><4> z%t?J9K3U^@T5vA0*bAaG2YrOEqB z;|OtlW6`LS^Gn|_vG=_oT$OG*!K|DziEFmlBHp_t$>-?Vpw_kLjV&y+yiL!otLruK z(Jc5SB)G<;Zn9EkGbF&RFPBzb<}us~mLc2=($H4j((eSLjw(=b=!wcyFo!)qSAeCU zER*B|#)@^B8bHx1VXMhYn*+U7Sru>5Xx_g2S(cr#9)!(a)0B7(N4R^g!n-?oe)7TKYg*oI7wQeJ zh*l8+lhQk6c2CP@;ij{(JROG>2x1u=xKf6ca12C@zqq4V-`v%6X#=S4-zqTqS%6Lg z%r3mIJ}AQz$b$DOpTpfne@OA4_)x7{;@V1c2Ot~HZqx9ay{f!H$d_UBk}mX>lTik!X^WNs|XCh^H0t`&A4^v~W4F>Zu+foqjbWQ|VV zSnlbH`WpTlf4kKp?By7J)I}kp_*i>SgQf>-)y#YIVx5nwH?4u2COEnDt@a<5rtBtGyR3sV-c?94rR`$a=i%E?1^tevT_)IQZv2pP`OOyOycT>u zeOIt6ti)aB-tO>cORlU9I)74vj@u?%E3h=pWNTCdKV&x+;q0ru8R6|m`%ogLx|hl+ z(yU7uM(Ka39YM<=&OCjk9xek3b}~F_O{ElZLg0XPK=ogMU)X)huM0E!>(P55e9xRgF{{ZIYTt3MWF4;Po} zs3##FoVQATm^+99>-TVNl!rfJRkWamakC&eMwS7cly-&eHIIGP9uajeN(IxYC?w~9 znF*~GJT3KFg!^dM7p373;p%+ZPTuP>B_$XFLuYN<_X_WEVk_UPBWp^ulz=B3J#y*HedOTjA`HTJ2O}_m@nC=>P)vH?2D)*nqtG-D@NYnt;NvZ z7Mcg+vvxCkVFHPpcKA5=l^9m)CTXO-NW8W&=&2~jkvoOj4!3G1e8=$4S+wnb4Krs*YJ!+^6>#NQ zx{g#Iy7sz(RhD*?1Zv180Jqy&2XeZ~HSGIlMO4B<`o)L}oUQpC7j6OzUmTB6dI=j~ zIe9tbfLiM)ka(pd&>sKFU|P=cp;V}sjgp6_Z-tx0vskspS+U#$2`|OB?u7>Ryg}3& zTcKGB`1(XuJ9%t4>cPRyUpzle3}~V1pdsM5ww{K=6=D^n@a@mLlQ_5&2cg!W=`@q! z9SQYg6o4h8pqbf@;AoD~BMo4`YVhDg2{r!3@`}nPL;5X1(*-TAY2FM0E1L5eI6o1-Is;6p{|o3~GQK@@rDb9fPf){_v9#TXB;1-qJp`7g;on3`XXs z9-~L_Y}2+-^FO;}0Z1ajA~}4u3WVOMV3hZ9v(VWs-0Oz0nY`bvk`Z-t80ZJ*&hAzr z61(%PG8ciI&ARQBdTm5o$+}^5a9h`bx)171aU1Nl`=|I3LLsoDu+t*%PTj}~&hER_ zbg_E!=`@?E-$8=)z}M$d{x-GhDyT*JEFhU9vK>5_0iTCLF{N?hvJcXA=O^QI!!oAC z4DlB-?)CbL%D?MNo(ygo0mn)*{38>{VAqiMwdwT!qq-em+oMmKIa>>bXAH5g-pQd` zXDqbE;-`CSd5e`x7yh}ONUZ&hJZc8oyhR8(R$lW|=68yqybf{M$dh~Gb)40iGxCPr z#DoFhseoRKajLMn*Nf5@eF4|LB-4S7YSjR1WAEWfUSwf)4HlBi4b68!cZtOlenOEc zCtYt*#3is2IXodpS>N1 z_~DTFa}L~-`AS5K40@IAT9Y8AO$YxT4B478h@5d`k`F>bp0jS7&^;p)akmm{bE~IO zExRrgZVoVClT7=LW+U?^u$O1W5rEw6pN(B6`KwdT`Ixg-9>RjPHP_*&@G|8xE}_BeBk$7G^B0_k z0$Va)6g^zcjKvd3B;Pi#8(+U4x5`sHBk~$`68QDK3U4-G`0wzK3`;f(M6nrB*ml}3 zt#31onqLuHHK-ZJ*yWDA1ZLEno{j4Te1h~1HQ`7kUl)Bz<^#rzCV6>2Eu{w=KVTT< zmPJ&8B?;l_zcPGIB!AImW<$M&CQ7|#^KNM>VJ@2riPjHj^I%vsSM&_OlLaHT#JB92 z`wwrG!r0BmjXktCUj-wPdu&1S>uyeVj73pA`YDFD5{@f^H*T7}(1zE+L8>LvYH``m zFT`p02jh%VcT*|lR)QmRPL)>LB{zapDp!qp6~1^6FC_WKu6uqyxbB$VKK9(1bmZ_P zQv>3oH^5i!2+~i~eVNIQjZiDAioDz`Lif8`r)Aahe1j$cUj0$oKiH#Z4 zY?iLW>2wdvqpvu8G#t>(U#J907}h{@YzQ|K!Z);q6+FaI!E)Fxo#B?~mwUW;l70^2 z6MFKaDfJ~ z*hV#UiPbB6#9!Q#mrc(bSC}e$4|l&ZI{Gj(bNfce?l|qbltECwlW)Vmx^Dv8(od5( z;N)5gmM;p~PfwruIt&iOsOf4_C;Jyy8z;eS$3GJn%|W7_F0^d<6s@uBihW|UCXRky zt(v6(_(oFCcq!ee1Nh6dqTP?W%YVRDPv{T6$a~RWSr`RtC_2{^wH|*U&$F5-q%XiH zkjdVd0_dks4lj23viOq7&Haoa=O2q0_^+85q+FxCV0*k|&+ael0ukdWK`I9mN>1h+ zjSxfV@&qX?75(eHy8%vch-)By%2y48ZQBoqzN%Bx^KQ#z+tqx)%v_#>TarY29@|e@ z`ykz>c2si%gI=M+Tb;sS{$_^Y!@>FU;HD~t3&K{5pN!qqPO~@cWkra$AFAqw&~ImE z_D&v|%foWC6(!x8aM4k*%S3gMc4t_WADK7`OMsTaR`0mR@t%8aP1X;F@9eqC1n}9H zk)8~?JrVdgNx%RyFlPvjjy(*b@*r#;r?!&9l;AT1QsT)?BU90H*?Yhtd1LE(lKOG9 zekyw+^lUSnO{o4m!#SlfAIQ2BBJWb6@5`MaA)6Nypj^`|lmC50F(n5>nx} zjGBdTYG&LEXZ_!!1BDT_v@NLWx}E-Nz1TGb5DRpCJwnP z8Z83Nx-_E|ng}?dfAxxtjC$Gf*o`?Wc(nS}uTUO5Myi+(|NKnuCj8B6l_oihHx=KE z;+uESJ??hyRswO@dqGVH5K60S4NdW~xs7zJG#bJfeiGjLSs2Tzu5-ho&w*5PGjWp z#14{3Mh4+mf9ZmeWki!iXtV9)a`54PVJzs8gbM{Uz)aeXW=#XK^c+_(e191rj z34O=clogGQnGU5Kh;t5!b_#OTLR~vWzWDno3HBWpp{p@{kJXrkw!jwP(&P$3pFufC zp~BbJZHgV5DF51uYp(xPHIP?Jp>9z2oICPXju@8r?7$FCpp%(`)dD_~!2(?9WAsoO z3@rfPj*%*w6l%UABm-=|iDTmLsxg-!LxvMr>Z+?}YeyP(u)x3{SI89sOUbm_5%NtE zeAN+hU-l}vXBFZ&cahDiC!Q7^*i`xZn)_spr>bUmWHNF7vm@-K#}u(6KaX*6vst*8 zvEoTK-zoC_aX&Mwn&a&x+q}@5@8W0g#KF*%MdAx-^m$7jUMZ<1SgH70yW(Yitj%6j z_TaG|h2`DnQ|iOK%S8*qroQm6RHu+h<#RFeYR})k`QxsnFc(WmFU`Pj`H$~~-)gom zdrLHqxqRiZrOq$6{t6Sflk7w4T3aXwmkY{1;bMJQ;I`6e2-SV!%rtX*pnAA0-~0Qa z$TQ*hfFW3hz#9jjm^4J|KlV{<3wqKah(?vYB}59S?6TqNqI2Qfe8+h3LlMWYRdt#2nPucKu@P8nu_1XJJF@U{=F)>81$X_j7+TW(2qbau_Vin{6J;PkMc`-SL7rXvZe^S>ZT)1d;` z-srD#?r#c31F|2mP|~!61`th>_|t#RMf2To)8$h_%}v`Im%0ulDGDm*Nzq_e?$~!N zU61$B7B*BEv^Kl5p4D}a$H>CFpjn=CniuBFJS1OKnW)xPA3Fq7yhhTeZ;38nd?a27 zX%($tAQGQ>Qe7QY9&;(~B$(dr%zqFIDg(>=k3$+aF@1vT7}ai1#+7?*bq?s^_Ueb~ z&lBA|2dWS@zBtN*7-j_MzCR4QGh#2Jv%6No^bv542PbPM;R65!z5RI6;RM&po->Rv zJRyUj>$hYFgyISy09!LN`^-t95;#E`fOl7Cai-Ex!{|H^Mg+2DRD^4)_=LQR)lJ^) zV{YwuF$?VB3w;uN)z5_iar69{OSm%h7}c!ZETv(lDgxi`io4eR(?)4lV8x~?9ph~= z@oZ0wdur90GM72ZrD~*tyWLfo9LGbr=2lyY>4EliJCZ^6>4^F022K8w3-Bl6?+G0= z%5wLZFQKv|Xs7OGBhWe-x{F;Mvc8gu=QEDH5j5M^?lM1fjL0v%%&i#|@k4=vM^HkG6YHjYjUKzT z9BA;Iuuvc8Ux3c$o5ii;3&$EdyDwkei8Ka|Efs{nFNWU}E7TiqT!A&#?4}jwU0X8C z+HR|>-?m~Zb74EB z{xtP&`dW%hbZI7XWNoR1{t*K6Hfa|FCEJ^ivR!s literal 0 HcmV?d00001 diff --git a/tests/testfile_catalog.md b/tests/testfile_catalog.md new file mode 100644 index 0000000..692daf8 --- /dev/null +++ b/tests/testfile_catalog.md @@ -0,0 +1,12 @@ +src_invalid +=========== + +- + + + +src_valid +========= + +- Example.jpg: image/jpeg, obtained from wikipedia.org +- blah.conf: text file with a .conf extension \ No newline at end of file From 61aa14c98d56362570b85ead89fd41c10202e346 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Thu, 23 Feb 2017 16:49:29 -0500 Subject: [PATCH 11/36] Change _write_log to _print_log --- bin/filecheck.py | 15 ++++++++------- kittengroomer/helpers.py | 2 +- tests/test_filecheck.py | 4 +++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 3b1a3a4..3f9231c 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -191,9 +191,9 @@ class KittenGroomerFileCheck(KittenGroomerBase): dict_to_return[subtype] = method return dict_to_return - def _print_log(self): + def _write_log(self): """Print the logs related to the current file being processed.""" - # TODO: change name to _write_log, move to helpers + # TODO: move to helpers tmp_log = self.logger.log.fields(**self.cur_file.log_details) if self.cur_file.is_dangerous(): tmp_log.warning(self.cur_file.log_string) @@ -205,7 +205,6 @@ class KittenGroomerFileCheck(KittenGroomerBase): def _run_process(self, command_string, timeout=None): """Run command_string in a subprocess, wait until it finishes.""" args = shlex.split(command_string) - # TODO: log_debug_err and log_debug are now broken, fix, move to helpers with open(self.logger.log_debug_err, 'ab') as stderr, open(self.logger.log_debug_out, 'ab') as stdout: try: subprocess.check_call(args, stdout=stdout, stderr=stderr, timeout=timeout) @@ -560,18 +559,18 @@ class KittenGroomerFileCheck(KittenGroomerBase): else: self._safe_copy() if not self.cur_file.is_recursive: - self._print_log() + self._write_log() def processdir(self, src_dir=None, dst_dir=None): """Main function coordinating file processing.""" - # TODO: do we need defaults here? + # TODO: do we actually need defaults here? if src_dir is None: src_dir = self.src_root_dir if dst_dir is None: dst_dir = self.dst_root_dir if self.recursive_archive_depth > 0: - self._print_log() + self._write_log() if self.recursive_archive_depth >= self.max_recursive_depth: self._handle_archivebomb(src_dir) @@ -579,9 +578,11 @@ class KittenGroomerFileCheck(KittenGroomerBase): for srcpath in self._list_all_files(src_dir): dstpath = srcpath.replace(src_dir, dst_dir) relative_path = srcpath.replace(src_dir + '/', '') - # which path do we want in the log? self.process_file(srcpath, dstpath, relative_path) + def process_archive(self, src_dir, dst_dir): + pass + if __name__ == '__main__': main(KittenGroomerFileCheck, 'File sanitizer used in CIRCLean. Renames potentially dangerous files.') diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 3455bbd..a19b3f4 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -286,7 +286,7 @@ class KittenGroomerBase(object): ####################### - # TODO: feels like this funciton doesn't need to exist if we move main() + # TODO: feels like this function doesn't need to exist if we move main() def processdir(self, src_dir, dst_dir): """ Implement this function in your subclass to define file processing behavior. diff --git a/tests/test_filecheck.py b/tests/test_filecheck.py index ac7cf42..d53eded 100644 --- a/tests/test_filecheck.py +++ b/tests/test_filecheck.py @@ -45,4 +45,6 @@ class TestIntegration: class TestFileHandling: - pass + def test_autorun(self): + # Run on a single autorun file, confirm that it gets flagged as dangerous + pass From cfeccc25614089e5afbbc745462ed293c2fb07f7 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Thu, 23 Feb 2017 17:39:16 -0500 Subject: [PATCH 12/36] Move main() into filecheck - Also change the name of processdir to process_dir --- bin/filecheck.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 3f9231c..d1f6033 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -5,6 +5,7 @@ import mimetypes import shlex import subprocess import zipfile +import argparse import oletools.oleid import olefile @@ -15,7 +16,7 @@ from PIL import Image # from PIL import PngImagePlugin from pdfid import PDFiD, cPDFiD -from kittengroomer import FileBase, KittenGroomerBase, main +from kittengroomer import FileBase, KittenGroomerBase SEVENZ_PATH = '/usr/bin/7z' @@ -389,7 +390,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): def _archive(self): """Processes an archive using 7zip. The archive is extracted to a - temporary directory and self.processdir is called on that directory. + temporary directory and self.process_dir is called on that directory. The recursive archive depth is increased to protect against archive bombs.""" self.cur_file.add_log_details('processing_type', 'archive') @@ -401,7 +402,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): self._run_process(extract_command) self.recursive_archive_depth += 1 self.logger.tree(tmpdir) - self.processdir(tmpdir, self.cur_file.dst_path) + self.process_dir(tmpdir, self.cur_file.dst_path) self.recursive_archive_depth -= 1 self._safe_rmtree(tmpdir) @@ -561,14 +562,8 @@ class KittenGroomerFileCheck(KittenGroomerBase): if not self.cur_file.is_recursive: self._write_log() - def processdir(self, src_dir=None, dst_dir=None): + def process_dir(self, src_dir, dst_dir): """Main function coordinating file processing.""" - # TODO: do we actually need defaults here? - if src_dir is None: - src_dir = self.src_root_dir - if dst_dir is None: - dst_dir = self.dst_root_dir - if self.recursive_archive_depth > 0: self._write_log() @@ -580,8 +575,17 @@ class KittenGroomerFileCheck(KittenGroomerBase): relative_path = srcpath.replace(src_dir + '/', '') self.process_file(srcpath, dstpath, relative_path) - def process_archive(self, src_dir, dst_dir): - pass + def run(self): + self.process_dir(self.src_root_dir, self.src_dst_dir) + + +def main(kg_implementation, description): + parser = argparse.ArgumentParser(prog='KittenGroomer', description=description) + parser.add_argument('-s', '--source', type=str, help='Source directory') + parser.add_argument('-d', '--destination', type=str, help='Destination directory') + args = parser.parse_args() + kg = kg_implementation(args.source, args.destination) + kg.run() if __name__ == '__main__': From bfa257534a0627d3c1bce08f0838f2658c9ae05d Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Thu, 23 Feb 2017 18:55:46 -0500 Subject: [PATCH 13/36] Remove old data/ path stuff from helpers.py --- kittengroomer/helpers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index a19b3f4..b20e6d1 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -240,9 +240,6 @@ class KittenGroomerBase(object): self.debug = debug self.cur_file = None self.logger = GroomerLog(self.dst_root_dir, debug) - # Add data/ to PATH - # self.resources_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data') - # os.environ["PATH"] += os.pathsep + self.resources_path # ##### Helpers ##### def _safe_rmtree(self, directory): From c6ecc5e3a3d1011d7d79afd62b69f9dcfd7c008b Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Thu, 23 Feb 2017 23:09:00 -0500 Subject: [PATCH 14/36] Fix process_dir bug in filecheck tests --- bin/filecheck.py | 2 +- kittengroomer/helpers.py | 2 +- tests/test_filecheck.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index d1f6033..54be51a 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -576,7 +576,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.process_file(srcpath, dstpath, relative_path) def run(self): - self.process_dir(self.src_root_dir, self.src_dst_dir) + self.process_dir(self.src_root_dir, self.dst_root_dir) def main(kg_implementation, description): diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index b20e6d1..e23d89d 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -60,7 +60,7 @@ class FileBase(object): else: try: mt = magic.from_file(self.src_path, mime=True) - # magic will always return something, even if it's just 'data' + # note: magic will always return something, even if it's just 'data' except UnicodeEncodeError as e: # FIXME: The encoding of the file is broken (possibly UTF-16) mt = '' diff --git a/tests/test_filecheck.py b/tests/test_filecheck.py index d53eded..b573a64 100644 --- a/tests/test_filecheck.py +++ b/tests/test_filecheck.py @@ -33,13 +33,13 @@ class TestIntegration: def test_filecheck(self, src_invalid, dst): groomer = KittenGroomerFileCheck(src_invalid, dst, debug=True) - groomer.processdir() + groomer.run() test_description = "filecheck_invalid" save_logs(groomer, test_description) def test_filecheck_2(self, src_valid, dst): groomer = KittenGroomerFileCheck(src_valid, dst, debug=True) - groomer.processdir() + groomer.run() test_description = "filecheck_valid" save_logs(groomer, test_description) From 4a99ba900a1266aef52a35d9c037ca7ec359a1ec Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Fri, 24 Feb 2017 10:41:59 -0500 Subject: [PATCH 15/36] Make self.get_extension return a value --- kittengroomer/helpers.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index e23d89d..7e2a433 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -45,13 +45,13 @@ class FileBase(object): # TODO: rename this. file_information? file_data? file_metadata? file_log? self.log_details = {'filepath': self.src_path} self.log_string = '' - self._determine_extension() + self.extension = self._determine_extension() self._determine_mimetype() self.logger = logger def _determine_extension(self): _, ext = os.path.splitext(self.src_path) - self.extension = ext.lower() + return ext.lower() def _determine_mimetype(self): if os.path.islink(self.src_path): @@ -60,7 +60,7 @@ class FileBase(object): else: try: mt = magic.from_file(self.src_path, mime=True) - # note: magic will always return something, even if it's just 'data' + # Note: magic will always return something, even if it's just 'data' except UnicodeEncodeError as e: # FIXME: The encoding of the file is broken (possibly UTF-16) mt = '' @@ -223,8 +223,9 @@ class GroomerLog(object): return s.hexdigest() def add_file(self): + # File object will add itself + # return a sublog for the file pass - # Should this return a sublog that the file can then write to? Does that work? Too confusing to understand? def add_event(self, event, log_level): pass From 3d36c90d6612ca0ca2386b38d5073fe5ba040eb7 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Fri, 24 Feb 2017 10:43:42 -0500 Subject: [PATCH 16/36] Make list_all_files a public method --- bin/filecheck.py | 2 +- examples/generic.py | 2 +- examples/pier9.py | 2 +- examples/specific.py | 2 +- kittengroomer/helpers.py | 3 +-- tests/test_kittengroomer.py | 2 +- 6 files changed, 6 insertions(+), 7 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 54be51a..a7e8ab5 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -570,7 +570,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): if self.recursive_archive_depth >= self.max_recursive_depth: self._handle_archivebomb(src_dir) - for srcpath in self._list_all_files(src_dir): + for srcpath in self.list_all_files(src_dir): dstpath = srcpath.replace(src_dir, dst_dir) relative_path = srcpath.replace(src_dir + '/', '') self.process_file(srcpath, dstpath, relative_path) diff --git a/examples/generic.py b/examples/generic.py index e76fccd..220b3db 100644 --- a/examples/generic.py +++ b/examples/generic.py @@ -339,7 +339,7 @@ class KittenGroomer(KittenGroomerBase): archbomb_path = src_dir[:-len('_temp')] self._safe_remove(archbomb_path) - for srcpath in self._list_all_files(src_dir): + for srcpath in self.list_all_files(src_dir): self.cur_file = File(srcpath, srcpath.replace(src_dir, dst_dir)) self.log_name.info('Processing {} ({}/{})', srcpath.replace(src_dir + '/', ''), diff --git a/examples/pier9.py b/examples/pier9.py index 6ded725..9252c4f 100644 --- a/examples/pier9.py +++ b/examples/pier9.py @@ -54,7 +54,7 @@ class KittenGroomerPier9(KittenGroomerBase): ''' Main function doing the processing ''' - for srcpath in self._list_all_files(self.src_root_dir): + for srcpath in self.list_all_files(self.src_root_dir): self.log_name.info('Processing {}', srcpath.replace(self.src_root_dir + '/', '')) self.cur_file = FilePier9(srcpath, srcpath.replace(self.src_root_dir, self.dst_root_dir)) if not self.cur_file.is_dangerous() and self.cur_file.extension in self.authorized_extensions: diff --git a/examples/specific.py b/examples/specific.py index fcca2f4..724ba35 100644 --- a/examples/specific.py +++ b/examples/specific.py @@ -54,7 +54,7 @@ class KittenGroomerSpec(KittenGroomerBase): ''' to_copy = [] error = [] - for srcpath in self._list_all_files(self.src_root_dir): + for srcpath in self.list_all_files(self.src_root_dir): valid = True self.log_name.info('Processing {}', srcpath.replace(self.src_root_dir + '/', '')) self.cur_file = FileSpec(srcpath, srcpath.replace(self.src_root_dir, self.dst_root_dir)) diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 7e2a433..9f7cced 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -274,8 +274,7 @@ class KittenGroomerBase(object): print(e) return False - # TODO: this isn't a private method, change and edit the groomers as well - def _list_all_files(self, directory): + def list_all_files(self, directory): """Generator yielding path to all of the files in a directory tree.""" for root, dirs, files in os.walk(directory): for filename in files: diff --git a/tests/test_kittengroomer.py b/tests/test_kittengroomer.py index bd16c5a..8339e84 100644 --- a/tests/test_kittengroomer.py +++ b/tests/test_kittengroomer.py @@ -271,7 +271,7 @@ class TestKittenGroomerBase: testdir = tmpdir.join('testdir') os.mkdir(testdir.strpath) simple_groomer = KittenGroomerBase(tmpdir.strpath, tmpdir.strpath) - files = simple_groomer._list_all_files(simple_groomer.src_root_dir) + files = simple_groomer.list_all_files(simple_groomer.src_root_dir) assert file.strpath in files assert testdir.strpath not in files From 53c1598af8e9c351d8b59b0c3dd300f952a9f296 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Mon, 27 Feb 2017 13:22:24 -0600 Subject: [PATCH 17/36] Move file processing methods into File object - It seems like filecheck will be easier to reason about if all of the file processing stuff happens in the File object. The Groomer object will now be responsible only for enumerating the files to be processed. - Tests won't pass for this commit, but wanted to make the diff cleaner but committing this before making changes. --- bin/filecheck.py | 97 +++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 47 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index a7e8ab5..8f941f7 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -96,6 +96,38 @@ class File(FileBase): self._check_extension() self._check_mime() + subtypes_apps = [ + (Config.mimes_office, self._winoffice), + (Config.mimes_ooxml, self._ooxml), + (Config.mimes_rtf, self.text), + (Config.mimes_libreoffice, self._libreoffice), + (Config.mimes_pdf, self._pdf), + (Config.mimes_xml, self.text), + (Config.mimes_ms, self._executables), + (Config.mimes_compressed, self._archive), + (Config.mimes_data, self._binary_app), + ] + self.app_subtype_methods = self._make_method_dict(subtypes_apps) + + types_metadata = [ + (Config.mimes_exif, self._metadata_exif), + (Config.mimes_png, self._metadata_png), + ] + self.metadata_mimetype_methods = self._make_method_dict(types_metadata) + + self.mime_processing_options = { + 'text': self.text, + 'audio': self.audio, + 'image': self.image, + 'video': self.video, + 'application': self.application, + 'example': self.example, + 'message': self.message, + 'model': self.model, + 'multipart': self.multipart, + 'inode': self.inode, + } + def _check_dangerous(self): if not self.has_mimetype(): # No mimetype, should not happen. @@ -137,51 +169,8 @@ class File(FileBase): self.log_details.update({'expected_extensions': expected_extensions}) self.make_dangerous() - def has_metadata(self): - if self.mimetype in Config.mimes_metadata: - return True - return False - - -class KittenGroomerFileCheck(KittenGroomerBase): - - def __init__(self, root_src, root_dst, max_recursive_depth=2, debug=False): - super(KittenGroomerFileCheck, self).__init__(root_src, root_dst, debug) - self.recursive_archive_depth = 0 - self.max_recursive_depth = max_recursive_depth - self.log_name = self.logger.log - - subtypes_apps = [ - (Config.mimes_office, self._winoffice), - (Config.mimes_ooxml, self._ooxml), - (Config.mimes_rtf, self.text), - (Config.mimes_libreoffice, self._libreoffice), - (Config.mimes_pdf, self._pdf), - (Config.mimes_xml, self.text), - (Config.mimes_ms, self._executables), - (Config.mimes_compressed, self._archive), - (Config.mimes_data, self._binary_app), - ] - self.app_subtype_methods = self._make_method_dict(subtypes_apps) - - types_metadata = [ - (Config.mimes_exif, self._metadata_exif), - (Config.mimes_png, self._metadata_png), - ] - self.metadata_mimetype_methods = self._make_method_dict(types_metadata) - - self.mime_processing_options = { - 'text': self.text, - 'audio': self.audio, - 'image': self.image, - 'video': self.video, - 'application': self.application, - 'example': self.example, - 'message': self.message, - 'model': self.model, - 'multipart': self.multipart, - 'inode': self.inode, - } + def check(self): + pass # ##### Helper functions ##### def _make_method_dict(self, list_of_tuples): @@ -194,7 +183,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): def _write_log(self): """Print the logs related to the current file being processed.""" - # TODO: move to helpers + # TODO: move to helpers? tmp_log = self.logger.log.fields(**self.cur_file.log_details) if self.cur_file.is_dangerous(): tmp_log.warning(self.cur_file.log_string) @@ -203,6 +192,11 @@ class KittenGroomerFileCheck(KittenGroomerBase): else: tmp_log.debug(self.cur_file.log_string) + def has_metadata(self): + if self.mimetype in Config.mimes_metadata: + return True + return False + def _run_process(self, command_string, timeout=None): """Run command_string in a subprocess, wait until it finishes.""" args = shlex.split(command_string) @@ -547,7 +541,14 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.cur_file.log_string += 'Image file' self.cur_file.add_log_details('processing_type', 'image') - ####################### + +class KittenGroomerFileCheck(KittenGroomerBase): + + def __init__(self, root_src, root_dst, max_recursive_depth=2, debug=False): + super(KittenGroomerFileCheck, self).__init__(root_src, root_dst, debug) + self.recursive_archive_depth = 0 + self.max_recursive_depth = max_recursive_depth + self.log_name = self.logger.log def process_file(self, srcpath, dstpath, relative_path): self.cur_file = File(srcpath, dstpath, self.logger) @@ -555,6 +556,8 @@ class KittenGroomerFileCheck(KittenGroomerBase): relative_path, self.cur_file.main_type, self.cur_file.sub_type) + self.cur_file.check() + if not self.cur_file.is_dangerous(): self.mime_processing_options.get(self.cur_file.main_type, self.unknown)() else: From 9aafe6e518ba63067db8781f9a12ccceefc9d297 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Mon, 27 Feb 2017 14:10:56 -0600 Subject: [PATCH 18/36] Remove cur_file from methods in File object --- bin/filecheck.py | 223 +++++++++++++++++++-------------------- kittengroomer/helpers.py | 2 +- 2 files changed, 112 insertions(+), 113 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 8f941f7..bda25dd 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -170,7 +170,8 @@ class File(FileBase): self.make_dangerous() def check(self): - pass + if not self.is_dangerous(): + self.mime_processing_options.get(self.main_type, self.unknown)() # ##### Helper functions ##### def _make_method_dict(self, list_of_tuples): @@ -184,14 +185,15 @@ class File(FileBase): def _write_log(self): """Print the logs related to the current file being processed.""" # TODO: move to helpers? - tmp_log = self.logger.log.fields(**self.cur_file.log_details) - if self.cur_file.is_dangerous(): - tmp_log.warning(self.cur_file.log_string) - elif self.cur_file.log_details.get('unknown') or self.cur_file.log_details.get('binary'): - tmp_log.info(self.cur_file.log_string) + tmp_log = self.logger.log.fields(**self.log_details) + if self.is_dangerous(): + tmp_log.warning(self.log_string) + elif self.log_details.get('unknown') or self.log_details.get('binary'): + tmp_log.info(self.log_string) else: - tmp_log.debug(self.cur_file.log_string) + tmp_log.debug(self.log_string) + # Make this an @property def has_metadata(self): if self.mimetype in Config.mimes_metadata: return True @@ -211,198 +213,197 @@ class File(FileBase): # ##### Discarded mimetypes, reason in the docstring ###### def inode(self): """Empty file or symlink.""" - if self.cur_file.is_symlink(): - self.cur_file.log_string += 'Symlink to {}'.format(self.cur_file.log_details['symlink']) + if self.is_symlink(): + self.log_string += 'Symlink to {}'.format(self.log_details['symlink']) else: - self.cur_file.log_string += 'Inode file' + self.log_string += 'Inode file' def unknown(self): """Main type should never be unknown.""" - self.cur_file.log_string += 'Unknown file' + self.log_string += 'Unknown file' def example(self): """Used in examples, should never be returned by libmagic.""" - self.cur_file.log_string += 'Example file' + self.log_string += 'Example file' def multipart(self): """Used in web apps, should never be returned by libmagic""" - self.cur_file.log_string += 'Multipart file' + self.log_string += 'Multipart file' # ##### Treated as malicious, no reason to have it on a USB key ###### def message(self): """Process a message file.""" - self.cur_file.log_string += 'Message file' - self.cur_file.make_dangerous() + self.log_string += 'Message file' + self.make_dangerous() self._safe_copy() def model(self): """Process a model file.""" - self.cur_file.log_string += 'Model file' - self.cur_file.make_dangerous() + self.log_string += 'Model file' + self.make_dangerous() self._safe_copy() # ##### Files that will be converted ###### def text(self): """Process an rtf, ooxml, or plaintext file.""" for mt in Config.mimes_rtf: - if mt in self.cur_file.sub_type: - self.cur_file.log_string += 'Rich Text file' + if mt in self.sub_type: + self.log_string += 'Rich Text file' # TODO: need a way to convert it to plain text - self.cur_file.force_ext('.txt') + self.force_ext('.txt') self._safe_copy() return for mt in Config.mimes_ooxml: - if mt in self.cur_file.sub_type: - self.cur_file.log_string += 'OOXML File' + if mt in self.sub_type: + self.log_string += 'OOXML File' self._ooxml() return - self.cur_file.log_string += 'Text file' - self.cur_file.force_ext('.txt') + self.log_string += 'Text file' + self.force_ext('.txt') self._safe_copy() def application(self): """Processes an application specific file according to its subtype.""" for subtype, method in self.app_subtype_methods.items(): - if subtype in self.cur_file.sub_type: + if subtype in self.sub_type: method() - self.cur_file.log_string += 'Application file' + self.log_string += 'Application file' return - self.cur_file.log_string += 'Unknown Application file' + self.log_string += 'Unknown Application file' self._unknown_app() def _executables(self): """Processes an executable file.""" - self.cur_file.add_log_details('processing_type', 'executable') - self.cur_file.make_dangerous() + self.add_log_details('processing_type', 'executable') + self.make_dangerous() self._safe_copy() def _winoffice(self): """Processes a winoffice file using olefile/oletools.""" - self.cur_file.add_log_details('processing_type', 'WinOffice') + self.add_log_details('processing_type', 'WinOffice') # Try as if it is a valid document - oid = oletools.oleid.OleID(self.cur_file.src_path) - if not olefile.isOleFile(self.cur_file.src_path): + oid = oletools.oleid.OleID(self.src_path) + if not olefile.isOleFile(self.src_path): # Manual processing, may already count as suspicious try: - ole = olefile.OleFileIO(self.cur_file.src_path, raise_defects=olefile.DEFECT_INCORRECT) + ole = olefile.OleFileIO(self.src_path, raise_defects=olefile.DEFECT_INCORRECT) except: - self.cur_file.add_log_details('not_parsable', True) - self.cur_file.make_dangerous() + self.add_log_details('not_parsable', True) + self.make_dangerous() if ole.parsing_issues: - self.cur_file.add_log_details('parsing_issues', True) - self.cur_file.make_dangerous() + self.add_log_details('parsing_issues', True) + self.make_dangerous() else: if ole.exists('macros/vba') or ole.exists('Macros') \ or ole.exists('_VBA_PROJECT_CUR') or ole.exists('VBA'): - self.cur_file.add_log_details('macro', True) - self.cur_file.make_dangerous() + self.add_log_details('macro', True) + self.make_dangerous() else: indicators = oid.check() # Encrypted ban be set by multiple checks on the script if oid.encrypted.value: - self.cur_file.add_log_details('encrypted', True) - self.cur_file.make_dangerous() + self.add_log_details('encrypted', True) + self.make_dangerous() if oid.macros.value or oid.ole.exists('macros/vba') or oid.ole.exists('Macros') \ or oid.ole.exists('_VBA_PROJECT_CUR') or oid.ole.exists('VBA'): - self.cur_file.add_log_details('macro', True) - self.cur_file.make_dangerous() + self.add_log_details('macro', True) + self.make_dangerous() for i in indicators: if i.id == 'ObjectPool' and i.value: # FIXME: Is it suspicious? - self.cur_file.add_log_details('objpool', True) + self.add_log_details('objpool', True) elif i.id == 'flash' and i.value: - self.cur_file.add_log_details('flash', True) - self.cur_file.make_dangerous() + self.add_log_details('flash', True) + self.make_dangerous() self._safe_copy() def _ooxml(self): """Processes an ooxml file.""" - self.cur_file.add_log_details('processing_type', 'ooxml') + self.add_log_details('processing_type', 'ooxml') try: - doc = officedissector.doc.Document(self.cur_file.src_path) + doc = officedissector.doc.Document(self.src_path) except Exception: # Invalid file - self.cur_file.make_dangerous() + self.make_dangerous() self._safe_copy() return # There are probably other potentially malicious features: # fonts, custom props, custom XML if doc.is_macro_enabled or len(doc.features.macros) > 0: - self.cur_file.add_log_details('macro', True) - self.cur_file.make_dangerous() + self.add_log_details('macro', True) + self.make_dangerous() if len(doc.features.embedded_controls) > 0: - self.cur_file.add_log_details('activex', True) - self.cur_file.make_dangerous() + self.add_log_details('activex', True) + self.make_dangerous() if len(doc.features.embedded_objects) > 0: # Exploited by CVE-2014-4114 (OLE) - self.cur_file.add_log_details('embedded_obj', True) - self.cur_file.make_dangerous() + self.add_log_details('embedded_obj', True) + self.make_dangerous() if len(doc.features.embedded_packages) > 0: - self.cur_file.add_log_details('embedded_pack', True) - self.cur_file.make_dangerous() + self.add_log_details('embedded_pack', True) + self.make_dangerous() self._safe_copy() def _libreoffice(self): """Processes a libreoffice file.""" - self.cur_file.add_log_details('processing_type', 'libreoffice') + self.add_log_details('processing_type', 'libreoffice') # As long as there ar no way to do a sanity check on the files => dangerous try: - lodoc = zipfile.ZipFile(self.cur_file.src_path, 'r') + lodoc = zipfile.ZipFile(self.src_path, 'r') except: - self.cur_file.add_log_details('invalid', True) - self.cur_file.make_dangerous() + self.add_log_details('invalid', True) + self.make_dangerous() for f in lodoc.infolist(): fname = f.filename.lower() if fname.startswith('script') or fname.startswith('basic') or \ fname.startswith('object') or fname.endswith('.bin'): - self.cur_file.add_log_details('macro', True) - self.cur_file.make_dangerous() + self.add_log_details('macro', True) + self.make_dangerous() self._safe_copy() def _pdf(self): """Processes a PDF file.""" - self.cur_file.add_log_details('processing_type', 'pdf') - xmlDoc = PDFiD(self.cur_file.src_path) + self.add_log_details('processing_type', 'pdf') + xmlDoc = PDFiD(self.src_path) oPDFiD = cPDFiD(xmlDoc, True) # TODO: other keywords? if oPDFiD.encrypt.count > 0: - self.cur_file.add_log_details('encrypted', True) - self.cur_file.make_dangerous() + self.add_log_details('encrypted', True) + self.make_dangerous() if oPDFiD.js.count > 0 or oPDFiD.javascript.count > 0: - self.cur_file.add_log_details('javascript', True) - self.cur_file.make_dangerous() + self.add_log_details('javascript', True) + self.make_dangerous() if oPDFiD.aa.count > 0 or oPDFiD.openaction.count > 0: - self.cur_file.add_log_details('openaction', True) - self.cur_file.make_dangerous() + self.add_log_details('openaction', True) + self.make_dangerous() if oPDFiD.richmedia.count > 0: - self.cur_file.add_log_details('flash', True) - self.cur_file.make_dangerous() + self.add_log_details('flash', True) + self.make_dangerous() if oPDFiD.launch.count > 0: - self.cur_file.add_log_details('launch', True) - self.cur_file.make_dangerous() - self._safe_copy() + self.add_log_details('launch', True) + self.make_dangerous() def _archive(self): """Processes an archive using 7zip. The archive is extracted to a temporary directory and self.process_dir is called on that directory. The recursive archive depth is increased to protect against archive bombs.""" - self.cur_file.add_log_details('processing_type', 'archive') - self.cur_file.is_recursive = True - self.cur_file.log_string += 'Archive extracted, processing content.' - tmpdir = self.cur_file.dst_path + '_temp' + self.add_log_details('processing_type', 'archive') + self.is_recursive = True + self.log_string += 'Archive extracted, processing content.' + tmpdir = self.dst_path + '_temp' self._safe_mkdir(tmpdir) - extract_command = '{} -p1 x "{}" -o"{}" -bd -aoa'.format(SEVENZ_PATH, self.cur_file.src_path, tmpdir) + extract_command = '{} -p1 x "{}" -o"{}" -bd -aoa'.format(SEVENZ_PATH, self.src_path, tmpdir) self._run_process(extract_command) self.recursive_archive_depth += 1 self.logger.tree(tmpdir) - self.process_dir(tmpdir, self.cur_file.dst_path) + self.process_dir(tmpdir, self.dst_path) self.recursive_archive_depth -= 1 self._safe_rmtree(tmpdir) def _handle_archivebomb(self, src_dir): - self.cur_file.make_dangerous() - self.cur_file.add_log_details('Archive Bomb', True) + self.make_dangerous() + self.add_log_details('Archive Bomb', True) self.log_name.warning('ARCHIVE BOMB.') self.log_name.warning('The content of the archive contains recursively other archives.') self.log_name.warning('This is a bad sign so the archive is not extracted to the destination key.') @@ -413,30 +414,30 @@ class File(FileBase): def _unknown_app(self): """Processes an unknown file.""" - self.cur_file.make_unknown() + self.make_unknown() self._safe_copy() def _binary_app(self): """Processses an unknown binary file.""" - self.cur_file.make_binary() + self.make_binary() self._safe_copy() ####################### # Metadata extractors def _metadata_exif(self, metadata_file_path): - img = open(self.cur_file.src_path, 'rb') + img = open(self.src_path, 'rb') tags = None try: tags = exifread.process_file(img, debug=True) except Exception as e: - print("Error while trying to grab full metadata for file {}; retrying for partial data.".format(self.cur_file.src_path)) + print("Error while trying to grab full metadata for file {}; retrying for partial data.".format(self.src_path)) print(e) if tags is None: try: tags = exifread.process_file(img, debug=True) except Exception as e: - print("Failed to get any metadata for file {}.".format(self.cur_file.src_path)) + print("Failed to get any metadata for file {}.".format(self.src_path)) print(e) img.close() return False @@ -456,32 +457,32 @@ class File(FileBase): with open(metadata_file_path, 'w+') as metadata_file: metadata_file.write("Key: {}\tValue: {}\n".format(tag, printable)) - self.cur_file.add_log_details('metadata', 'exif') + self.add_log_details('metadata', 'exif') img.close() return True def _metadata_png(self, metadata_file_path): warnings.simplefilter('error', Image.DecompressionBombWarning) try: - img = Image.open(self.cur_file.src_path) + img = Image.open(self.src_path) for tag in sorted(img.info.keys()): # These are long and obnoxious/binary if tag not in ('icc_profile'): with open(metadata_file_path, 'w+') as metadata_file: metadata_file.write("Key: {}\tValue: {}\n".format(tag, img.info[tag])) - self.cur_file.add_log_details('metadata', 'png') + self.add_log_details('metadata', 'png') img.close() # Catch decompression bombs except Exception as e: - print("Caught exception processing metadata for {}".format(self.cur_file.src_path)) + print("Caught exception processing metadata for {}".format(self.src_path)) print(e) - self.cur_file.make_dangerous() + self.make_dangerous() self._safe_copy() return False def extract_metadata(self): - metadata_file_path = self.cur_file.create_metadata_file(".metadata.txt") - mt = self.cur_file.mimetype + metadata_file_path = self.create_metadata_file(".metadata.txt") + mt = self.mimetype metadata_processing_method = self.metadata_mimetype_methods.get(mt) if metadata_processing_method: # TODO: should we return metadata and write it here instead of in processing method? @@ -491,17 +492,17 @@ class File(FileBase): # ##### Media - audio and video aren't converted ###### def audio(self): """Processes an audio file.""" - self.cur_file.log_string += 'Audio file' + self.log_string += 'Audio file' self._media_processing() def video(self): """Processes a video.""" - self.cur_file.log_string += 'Video file' + self.log_string += 'Video file' self._media_processing() def _media_processing(self): """Generic way to process all media files.""" - self.cur_file.add_log_details('processing_type', 'media') + self.add_log_details('processing_type', 'media') self._safe_copy() def image(self): @@ -510,12 +511,12 @@ class File(FileBase): Extracts metadata if metadata is present. Creates a temporary directory, opens the using PIL.Image, saves it to the temporary directory, and copies it to the destination.""" - if self.cur_file.has_metadata(): + if self.has_metadata(): self.extract_metadata() # FIXME make sure this works for png, gif, tiff # Create a temp directory - dst_dir, filename = os.path.split(self.cur_file.dst_path) + dst_dir, filename = os.path.split(self.dst_path) tmpdir = os.path.join(dst_dir, 'temp') tmppath = os.path.join(tmpdir, filename) self._safe_mkdir(tmpdir) @@ -523,7 +524,7 @@ class File(FileBase): # Do our image conversions warnings.simplefilter('error', Image.DecompressionBombWarning) try: - imIn = Image.open(self.cur_file.src_path) + imIn = Image.open(self.src_path) imOut = Image.frombytes(imIn.mode, imIn.size, imIn.tobytes()) imOut.save(tmppath) @@ -533,13 +534,13 @@ class File(FileBase): # Catch decompression bombs except Exception as e: - print("Caught exception (possible decompression bomb?) while translating file {}.".format(self.cur_file.src_path)) + print("Caught exception (possible decompression bomb?) while translating file {}.".format(self.src_path)) print(e) - self.cur_file.make_dangerous() + self.make_dangerous() self._safe_copy() - self.cur_file.log_string += 'Image file' - self.cur_file.add_log_details('processing_type', 'image') + self.log_string += 'Image file' + self.add_log_details('processing_type', 'image') class KittenGroomerFileCheck(KittenGroomerBase): @@ -557,22 +558,20 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.cur_file.main_type, self.cur_file.sub_type) self.cur_file.check() - - if not self.cur_file.is_dangerous(): - self.mime_processing_options.get(self.cur_file.main_type, self.unknown)() + if self.cur_file.is_archive: + # Handle archive + pass else: + # TODO: Check if should be copied, maybe have an attribute for this? self._safe_copy() - if not self.cur_file.is_recursive: self._write_log() def process_dir(self, src_dir, dst_dir): """Main function coordinating file processing.""" if self.recursive_archive_depth > 0: self._write_log() - if self.recursive_archive_depth >= self.max_recursive_depth: self._handle_archivebomb(src_dir) - for srcpath in self.list_all_files(src_dir): dstpath = srcpath.replace(src_dir, dst_dir) relative_path = srcpath.replace(src_dir + '/', '') diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 9f7cced..78d9dcd 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -42,7 +42,7 @@ class FileBase(object): """Initialized with the source path and expected destination path.""" self.src_path = src_path self.dst_path = dst_path - # TODO: rename this. file_information? file_data? file_metadata? file_log? + # TODO: rename this to file_properties self.log_details = {'filepath': self.src_path} self.log_string = '' self.extension = self._determine_extension() From 781d0a76af061d0dc5b5863f45a02b3d64bc9a98 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Wed, 1 Mar 2017 15:24:48 -0500 Subject: [PATCH 19/36] First working version with methods in File object - All tests now passing with file handling methods on File object instead of Groomer object. - Logging functionality still isn't finished. --- bin/filecheck.py | 193 +++++++++++++++++++-------------------- kittengroomer/helpers.py | 3 +- 2 files changed, 98 insertions(+), 98 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index bda25dd..6afbd60 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -86,15 +86,9 @@ class File(FileBase): def __init__(self, src_path, dst_path, logger): super(File, self).__init__(src_path, dst_path, logger) self.is_recursive = False - self._check_dangerous() - if self.is_dangerous(): - return - self.log_details.update({'maintype': self.main_type, 'subtype': self.sub_type, 'extension': self.extension}) - self._check_extension() - self._check_mime() subtypes_apps = [ (Config.mimes_office, self._winoffice), @@ -129,8 +123,7 @@ class File(FileBase): } def _check_dangerous(self): - if not self.has_mimetype(): - # No mimetype, should not happen. + if not self.has_mimetype(): # No mimetype, should not happen. self.make_dangerous() if not self.has_extension(): self.make_dangerous() @@ -147,7 +140,8 @@ class File(FileBase): if self.extension in Config.override_ext: expected_mimetype = Config.override_ext[self.extension] else: - expected_mimetype, encoding = mimetypes.guess_type(self.src_path, strict=False) + expected_mimetype, encoding = mimetypes.guess_type(self.src_path, + strict=False) if expected_mimetype in Config.aliases: expected_mimetype = Config.aliases[expected_mimetype] is_known_extension = self.extension in mimetypes.types_map.keys() @@ -155,7 +149,7 @@ class File(FileBase): self.log_details.update({'expected_mimetype': expected_mimetype}) self.make_dangerous() - def _check_mime(self): + def _check_mimetype(self): """Takes the mimetype (as determined by libmagic) and determines whether the list of extensions that are normally associated with that extension contains the file's actual extension.""" @@ -163,13 +157,17 @@ class File(FileBase): mimetype = Config.aliases[self.mimetype] else: mimetype = self.mimetype - expected_extensions = mimetypes.guess_all_extensions(mimetype, strict=False) + expected_extensions = mimetypes.guess_all_extensions(mimetype, + strict=False) if expected_extensions: - if len(self.extension) > 0 and self.extension not in expected_extensions: + if self.has_extension() and self.extension not in expected_extensions: self.log_details.update({'expected_extensions': expected_extensions}) self.make_dangerous() def check(self): + self._check_dangerous() + self._check_extension() + self._check_mimetype() if not self.is_dangerous(): self.mime_processing_options.get(self.main_type, self.unknown)() @@ -182,7 +180,7 @@ class File(FileBase): dict_to_return[subtype] = method return dict_to_return - def _write_log(self): + def write_log(self): """Print the logs related to the current file being processed.""" # TODO: move to helpers? tmp_log = self.logger.log.fields(**self.log_details) @@ -209,6 +207,13 @@ class File(FileBase): return return True + def _make_tempdir(self): + """Make a temporary directory.""" + self.tempdir_path = self.dst_path + '_temp' + if not os.path.exists(self.tempdir_path): + os.makedirs(self.tempdir_path) + return self.tempdir_path + ####################### # ##### Discarded mimetypes, reason in the docstring ###### def inode(self): @@ -235,13 +240,11 @@ class File(FileBase): """Process a message file.""" self.log_string += 'Message file' self.make_dangerous() - self._safe_copy() def model(self): """Process a model file.""" self.log_string += 'Model file' self.make_dangerous() - self._safe_copy() # ##### Files that will be converted ###### def text(self): @@ -251,7 +254,6 @@ class File(FileBase): self.log_string += 'Rich Text file' # TODO: need a way to convert it to plain text self.force_ext('.txt') - self._safe_copy() return for mt in Config.mimes_ooxml: if mt in self.sub_type: @@ -260,7 +262,6 @@ class File(FileBase): return self.log_string += 'Text file' self.force_ext('.txt') - self._safe_copy() def application(self): """Processes an application specific file according to its subtype.""" @@ -276,7 +277,6 @@ class File(FileBase): """Processes an executable file.""" self.add_log_details('processing_type', 'executable') self.make_dangerous() - self._safe_copy() def _winoffice(self): """Processes a winoffice file using olefile/oletools.""" @@ -315,7 +315,6 @@ class File(FileBase): elif i.id == 'flash' and i.value: self.add_log_details('flash', True) self.make_dangerous() - self._safe_copy() def _ooxml(self): """Processes an ooxml file.""" @@ -325,7 +324,6 @@ class File(FileBase): except Exception: # Invalid file self.make_dangerous() - self._safe_copy() return # There are probably other potentially malicious features: # fonts, custom props, custom XML @@ -342,7 +340,6 @@ class File(FileBase): if len(doc.features.embedded_packages) > 0: self.add_log_details('embedded_pack', True) self.make_dangerous() - self._safe_copy() def _libreoffice(self): """Processes a libreoffice file.""" @@ -359,7 +356,6 @@ class File(FileBase): fname.startswith('object') or fname.endswith('.bin'): self.add_log_details('macro', True) self.make_dangerous() - self._safe_copy() def _pdf(self): """Processes a PDF file.""" @@ -390,37 +386,15 @@ class File(FileBase): bombs.""" self.add_log_details('processing_type', 'archive') self.is_recursive = True - self.log_string += 'Archive extracted, processing content.' - tmpdir = self.dst_path + '_temp' - self._safe_mkdir(tmpdir) - extract_command = '{} -p1 x "{}" -o"{}" -bd -aoa'.format(SEVENZ_PATH, self.src_path, tmpdir) - self._run_process(extract_command) - self.recursive_archive_depth += 1 - self.logger.tree(tmpdir) - self.process_dir(tmpdir, self.dst_path) - self.recursive_archive_depth -= 1 - self._safe_rmtree(tmpdir) - - def _handle_archivebomb(self, src_dir): - self.make_dangerous() - self.add_log_details('Archive Bomb', True) - self.log_name.warning('ARCHIVE BOMB.') - self.log_name.warning('The content of the archive contains recursively other archives.') - self.log_name.warning('This is a bad sign so the archive is not extracted to the destination key.') - self._safe_rmtree(src_dir) - if src_dir.endswith('_temp'): - bomb_path = src_dir[:-len('_temp')] - self._safe_remove(bomb_path) + # self.log_string += 'Archive extracted, processing content.' def _unknown_app(self): """Processes an unknown file.""" self.make_unknown() - self._safe_copy() def _binary_app(self): """Processses an unknown binary file.""" self.make_binary() - self._safe_copy() ####################### # Metadata extractors @@ -431,12 +405,14 @@ class File(FileBase): try: tags = exifread.process_file(img, debug=True) except Exception as e: + # TODO: log instead of print print("Error while trying to grab full metadata for file {}; retrying for partial data.".format(self.src_path)) print(e) if tags is None: try: tags = exifread.process_file(img, debug=True) except Exception as e: + # TODO: log instead of print print("Failed to get any metadata for file {}.".format(self.src_path)) print(e) img.close() @@ -450,10 +426,7 @@ class File(FileBase): # Exifreader truncates data. if len(printable) > 25 and printable.endswith(", ... ]"): value = tags[tag].values - if isinstance(value, str): - printable = value - else: - printable = str(value) + printable = str(value) with open(metadata_file_path, 'w+') as metadata_file: metadata_file.write("Key: {}\tValue: {}\n".format(tag, printable)) @@ -474,10 +447,10 @@ class File(FileBase): img.close() # Catch decompression bombs except Exception as e: + # TODO: log instead of print print("Caught exception processing metadata for {}".format(self.src_path)) print(e) self.make_dangerous() - self._safe_copy() return False def extract_metadata(self): @@ -503,42 +476,29 @@ class File(FileBase): def _media_processing(self): """Generic way to process all media files.""" self.add_log_details('processing_type', 'media') - self._safe_copy() def image(self): """Processes an image. - Extracts metadata if metadata is present. Creates a temporary - directory, opens the using PIL.Image, saves it to the temporary - directory, and copies it to the destination.""" + Extracts metadata to dest key if metadata is present. Creates a + temporary directory on dest key, opens the using PIL.Image,saves it to + the temporary directory, and copies it to the destination.""" + # TODO: make sure this method works for png, gif, tiff if self.has_metadata(): self.extract_metadata() - - # FIXME make sure this works for png, gif, tiff - # Create a temp directory - dst_dir, filename = os.path.split(self.dst_path) - tmpdir = os.path.join(dst_dir, 'temp') - tmppath = os.path.join(tmpdir, filename) - self._safe_mkdir(tmpdir) - - # Do our image conversions + tempdir_path = self._make_tempdir() + tempfile_path = os.path.join(tempdir_path, self.filename) warnings.simplefilter('error', Image.DecompressionBombWarning) - try: - imIn = Image.open(self.src_path) - imOut = Image.frombytes(imIn.mode, imIn.size, imIn.tobytes()) - imOut.save(tmppath) - - # Copy the file back out and cleanup - self._safe_copy(tmppath) - self._safe_rmtree(tmpdir) - - # Catch decompression bombs - except Exception as e: + try: # Do image conversions + img_in = Image.open(self.src_path) + img_out = Image.frombytes(img_in.mode, img_in.size, img_in.tobytes()) + img_out.save(tempfile_path) + self.src_path = tempfile_path + except Exception as e: # Catch decompression bombs + # TODO: change this from printing to logging print("Caught exception (possible decompression bomb?) while translating file {}.".format(self.src_path)) print(e) self.make_dangerous() - self._safe_copy() - self.log_string += 'Image file' self.add_log_details('processing_type', 'image') @@ -549,33 +509,72 @@ class KittenGroomerFileCheck(KittenGroomerBase): super(KittenGroomerFileCheck, self).__init__(root_src, root_dst, debug) self.recursive_archive_depth = 0 self.max_recursive_depth = max_recursive_depth - self.log_name = self.logger.log - - def process_file(self, srcpath, dstpath, relative_path): - self.cur_file = File(srcpath, dstpath, self.logger) - self.log_name.info('Processing {} ({}/{})', - relative_path, - self.cur_file.main_type, - self.cur_file.sub_type) - self.cur_file.check() - if self.cur_file.is_archive: - # Handle archive - pass - else: - # TODO: Check if should be copied, maybe have an attribute for this? - self._safe_copy() - self._write_log() def process_dir(self, src_dir, dst_dir): """Main function coordinating file processing.""" - if self.recursive_archive_depth > 0: - self._write_log() - if self.recursive_archive_depth >= self.max_recursive_depth: - self._handle_archivebomb(src_dir) + # TODO: Think we want to move write_log elsewhere: + # if self.recursive_archive_depth > 0: + # self.write_log() + # TODO: Can we clean up the way we handle relative_path? for srcpath in self.list_all_files(src_dir): dstpath = srcpath.replace(src_dir, dst_dir) relative_path = srcpath.replace(src_dir + '/', '') - self.process_file(srcpath, dstpath, relative_path) + self.cur_file = File(srcpath, dstpath, self.logger) + # TODO: move this logging code elsewhere + self.logger.log.info('Processing {} ({}/{})', + relative_path, + self.cur_file.main_type, + self.cur_file.sub_type) + self.process_file(self.cur_file) + + def process_file(self, file): + file.check() + if file.is_recursive: + self.process_archive(file) + else: + # TODO: Check if should be copied, make an attribute for should be copied True/False + self._safe_copy() + file.write_log() + if hasattr(file, "tempdir_path"): + self._safe_rmtree(file.tempdir_path) + + def process_archive(self, file): + """Unpacks an archive using 7zip and processes contents. + + Should be given a Kittengroomer file object whose src_path points + to an archive.""" + self.recursive_archive_depth += 1 + # Check for archivebomb + if self.recursive_archive_depth >= self.max_recursive_depth: + self._handle_archivebomb(file) + else: + tempdir_path = file._make_tempdir() + # Unpack the archive + base_command = '{} -p1 x "{}" -o"{}" -bd -aoa' + extract_command = base_command.format(SEVENZ_PATH, file.src_path, tempdir_path) + file._run_process(extract_command) + # Add it to the tree + self.logger.tree(tempdir_path) + # List all files, process them + self.process_dir(tempdir_path, file.dst_path) + # Clean up + self._safe_rmtree(tempdir_path) + self.recursive_archive_depth -= 1 + + + def _handle_archivebomb(self, file): + file.make_dangerous() + file.add_log_details('Archive Bomb', True) + self.logger.log.warning('ARCHIVE BOMB.') + self.logger.log.warning('The content of the archive contains recursively other archives.') + self.logger.log.warning('This is a bad sign so the archive is not extracted to the destination key.') + # TODO: are we sure we want to delete the archive on the source key? Commenting out for now + # self._safe_rmtree(file.src_dir) + # What is the goal of this code: + # if file.src_dir.endswith('_temp'): + # # TODO: change the way bomb_path is constructed and the way we check for tempdir + # bomb_path = file.src_dir[:-len('_temp')] + # self._safe_remove(bomb_path) def run(self): self.process_dir(self.src_root_dir, self.dst_root_dir) diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 78d9dcd..2efde64 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -42,12 +42,13 @@ class FileBase(object): """Initialized with the source path and expected destination path.""" self.src_path = src_path self.dst_path = dst_path - # TODO: rename this to file_properties + # TODO: rename this to file_properties (and change in other groomers) self.log_details = {'filepath': self.src_path} self.log_string = '' self.extension = self._determine_extension() self._determine_mimetype() self.logger = logger + self.filename = os.path.basename(self.src_path) def _determine_extension(self): _, ext = os.path.splitext(self.src_path) From 8d7dd1197f6d59160848bb6c7e38df3187e87aa0 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Wed, 1 Mar 2017 17:43:43 -0500 Subject: [PATCH 20/36] Move run_process back to Groomer object --- .travis.yml | 4 ++-- bin/filecheck.py | 60 ++++++++++++++++++++++-------------------------- 2 files changed, 29 insertions(+), 35 deletions(-) diff --git a/.travis.yml b/.travis.yml index b778bf8..786eeab 100644 --- a/.travis.yml +++ b/.travis.yml @@ -66,8 +66,8 @@ install: - rm fraunhoferlibrary.zip - 7z x -p42 42.zip # Some random samples - - wget http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3 - - wget http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4 + # - wget http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3 + # - wget http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4 - wget http://thewalter.net/stef/software/rtfx/sample.rtf - popd diff --git a/bin/filecheck.py b/bin/filecheck.py index 6afbd60..715aa44 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -182,7 +182,7 @@ class File(FileBase): def write_log(self): """Print the logs related to the current file being processed.""" - # TODO: move to helpers? + # TODO: move to helpers.py? tmp_log = self.logger.log.fields(**self.log_details) if self.is_dangerous(): tmp_log.warning(self.log_string) @@ -191,23 +191,13 @@ class File(FileBase): else: tmp_log.debug(self.log_string) - # Make this an @property + # TODO: Make this an @property def has_metadata(self): if self.mimetype in Config.mimes_metadata: return True return False - def _run_process(self, command_string, timeout=None): - """Run command_string in a subprocess, wait until it finishes.""" - args = shlex.split(command_string) - with open(self.logger.log_debug_err, 'ab') as stderr, open(self.logger.log_debug_out, 'ab') as stdout: - try: - subprocess.check_call(args, stdout=stdout, stderr=stderr, timeout=timeout) - except (subprocess.TimeoutExpired, subprocess.CalledProcessError): - return - return True - - def _make_tempdir(self): + def make_tempdir(self): """Make a temporary directory.""" self.tempdir_path = self.dst_path + '_temp' if not os.path.exists(self.tempdir_path): @@ -348,6 +338,7 @@ class File(FileBase): try: lodoc = zipfile.ZipFile(self.src_path, 'r') except: + # TODO: are there specific exceptions we should catch here? Or is anything ok self.add_log_details('invalid', True) self.make_dangerous() for f in lodoc.infolist(): @@ -362,7 +353,7 @@ class File(FileBase): self.add_log_details('processing_type', 'pdf') xmlDoc = PDFiD(self.src_path) oPDFiD = cPDFiD(xmlDoc, True) - # TODO: other keywords? + # TODO: are there other characteristics which should be dangerous? if oPDFiD.encrypt.count > 0: self.add_log_details('encrypted', True) self.make_dangerous() @@ -399,6 +390,7 @@ class File(FileBase): ####################### # Metadata extractors def _metadata_exif(self, metadata_file_path): + # TODO: this method is kind of long, can we shorten it? img = open(self.src_path, 'rb') tags = None @@ -486,7 +478,7 @@ class File(FileBase): # TODO: make sure this method works for png, gif, tiff if self.has_metadata(): self.extract_metadata() - tempdir_path = self._make_tempdir() + tempdir_path = self.make_tempdir() tempfile_path = os.path.join(tempdir_path, self.filename) warnings.simplefilter('error', Image.DecompressionBombWarning) try: # Do image conversions @@ -495,6 +487,7 @@ class File(FileBase): img_out.save(tempfile_path) self.src_path = tempfile_path except Exception as e: # Catch decompression bombs + # TODO: change this from all Exceptions to specific DecompressionBombWarning # TODO: change this from printing to logging print("Caught exception (possible decompression bomb?) while translating file {}.".format(self.src_path)) print(e) @@ -512,7 +505,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): def process_dir(self, src_dir, dst_dir): """Main function coordinating file processing.""" - # TODO: Think we want to move write_log elsewhere: + # TODO: we probably want to move this write_log elsewhere: # if self.recursive_archive_depth > 0: # self.write_log() # TODO: Can we clean up the way we handle relative_path? @@ -532,7 +525,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): if file.is_recursive: self.process_archive(file) else: - # TODO: Check if should be copied, make an attribute for should be copied True/False + # TODO: Make a File attribute for should be copied True/False and check it self._safe_copy() file.write_log() if hasattr(file, "tempdir_path"): @@ -548,33 +541,34 @@ class KittenGroomerFileCheck(KittenGroomerBase): if self.recursive_archive_depth >= self.max_recursive_depth: self._handle_archivebomb(file) else: - tempdir_path = file._make_tempdir() - # Unpack the archive - base_command = '{} -p1 x "{}" -o"{}" -bd -aoa' - extract_command = base_command.format(SEVENZ_PATH, file.src_path, tempdir_path) - file._run_process(extract_command) - # Add it to the tree + tempdir_path = file.make_tempdir() + command_str = '{} -p1 x "{}" -o"{}" -bd -aoa' + unpack_command = command_str.format(SEVENZ_PATH, + file.src_path, tempdir_path) + self._run_process(unpack_command) + # TODO: check that tree is working correctly here self.logger.tree(tempdir_path) - # List all files, process them self.process_dir(tempdir_path, file.dst_path) - # Clean up self._safe_rmtree(tempdir_path) self.recursive_archive_depth -= 1 - def _handle_archivebomb(self, file): file.make_dangerous() file.add_log_details('Archive Bomb', True) self.logger.log.warning('ARCHIVE BOMB.') self.logger.log.warning('The content of the archive contains recursively other archives.') self.logger.log.warning('This is a bad sign so the archive is not extracted to the destination key.') - # TODO: are we sure we want to delete the archive on the source key? Commenting out for now - # self._safe_rmtree(file.src_dir) - # What is the goal of this code: - # if file.src_dir.endswith('_temp'): - # # TODO: change the way bomb_path is constructed and the way we check for tempdir - # bomb_path = file.src_dir[:-len('_temp')] - # self._safe_remove(bomb_path) + # TODO: delete whatever we want to delete that's already been copied to dest dir + + def _run_process(self, command_string, timeout=None): + """Run command_string in a subprocess, wait until it finishes.""" + args = shlex.split(command_string) + with open(self.logger.log_debug_err, 'ab') as stderr, open(self.logger.log_debug_out, 'ab') as stdout: + try: + subprocess.check_call(args, stdout=stdout, stderr=stderr, timeout=timeout) + except (subprocess.TimeoutExpired, subprocess.CalledProcessError): + return + return True def run(self): self.process_dir(self.src_root_dir, self.dst_root_dir) From 9832101c85bda9e96330b768c16d9ef00331b562 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Mon, 27 Feb 2017 10:58:10 -0600 Subject: [PATCH 21/36] Identify TODOs that are log related --- bin/filecheck.py | 20 ++++++++++---------- kittengroomer/helpers.py | 14 +++++++------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 715aa44..ccf0b0e 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -182,7 +182,7 @@ class File(FileBase): def write_log(self): """Print the logs related to the current file being processed.""" - # TODO: move to helpers.py? + # LOG: move to helpers.py? tmp_log = self.logger.log.fields(**self.log_details) if self.is_dangerous(): tmp_log.warning(self.log_string) @@ -191,7 +191,7 @@ class File(FileBase): else: tmp_log.debug(self.log_string) - # TODO: Make this an @property + @property def has_metadata(self): if self.mimetype in Config.mimes_metadata: return True @@ -397,14 +397,14 @@ class File(FileBase): try: tags = exifread.process_file(img, debug=True) except Exception as e: - # TODO: log instead of print + # LOG: log instead of print print("Error while trying to grab full metadata for file {}; retrying for partial data.".format(self.src_path)) print(e) if tags is None: try: tags = exifread.process_file(img, debug=True) except Exception as e: - # TODO: log instead of print + # LOG: log instead of print print("Failed to get any metadata for file {}.".format(self.src_path)) print(e) img.close() @@ -439,7 +439,7 @@ class File(FileBase): img.close() # Catch decompression bombs except Exception as e: - # TODO: log instead of print + # LOG: log instead of print print("Caught exception processing metadata for {}".format(self.src_path)) print(e) self.make_dangerous() @@ -476,7 +476,7 @@ class File(FileBase): temporary directory on dest key, opens the using PIL.Image,saves it to the temporary directory, and copies it to the destination.""" # TODO: make sure this method works for png, gif, tiff - if self.has_metadata(): + if self.has_metadata: self.extract_metadata() tempdir_path = self.make_tempdir() tempfile_path = os.path.join(tempdir_path, self.filename) @@ -488,7 +488,7 @@ class File(FileBase): self.src_path = tempfile_path except Exception as e: # Catch decompression bombs # TODO: change this from all Exceptions to specific DecompressionBombWarning - # TODO: change this from printing to logging + # LOG: change this from printing to logging print("Caught exception (possible decompression bomb?) while translating file {}.".format(self.src_path)) print(e) self.make_dangerous() @@ -505,7 +505,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): def process_dir(self, src_dir, dst_dir): """Main function coordinating file processing.""" - # TODO: we probably want to move this write_log elsewhere: + # LOG: we probably want to move this write_log elsewhere: # if self.recursive_archive_depth > 0: # self.write_log() # TODO: Can we clean up the way we handle relative_path? @@ -513,7 +513,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): dstpath = srcpath.replace(src_dir, dst_dir) relative_path = srcpath.replace(src_dir + '/', '') self.cur_file = File(srcpath, dstpath, self.logger) - # TODO: move this logging code elsewhere + # LOG: move this logging code elsewhere self.logger.log.info('Processing {} ({}/{})', relative_path, self.cur_file.main_type, @@ -546,7 +546,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): unpack_command = command_str.format(SEVENZ_PATH, file.src_path, tempdir_path) self._run_process(unpack_command) - # TODO: check that tree is working correctly here + # LOG: check that tree is working correctly here self.logger.tree(tempdir_path) self.process_dir(tempdir_path, file.dst_path) self._safe_rmtree(tempdir_path) diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 2efde64..a2c17a2 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -173,12 +173,12 @@ class FileBase(object): self.metadata_file_path = self.dst_path + ext return self.metadata_file_path except KittenGroomerError as e: - # TODO: Write to log file + # LOG: Write to log file return False -class GroomerLog(object): - """Groomer logging object""" +class GroomerLogger(object): + """Groomer logging interface""" def __init__(self, root_dir, debug=False): self.log_dir_path = os.path.join(root_dir, 'logs') @@ -223,8 +223,8 @@ class GroomerLog(object): s.update(buf) return s.hexdigest() - def add_file(self): - # File object will add itself + def add_file(self, file): + # File object will add itself? # return a sublog for the file pass @@ -241,7 +241,7 @@ class KittenGroomerBase(object): self.dst_root_dir = root_dst self.debug = debug self.cur_file = None - self.logger = GroomerLog(self.dst_root_dir, debug) + self.logger = GroomerLogger(self.dst_root_dir, debug) # ##### Helpers ##### def _safe_rmtree(self, directory): @@ -271,7 +271,7 @@ class KittenGroomerBase(object): shutil.copy(src, dst) return True except Exception as e: - # TODO: Logfile + # LOG: Logfile print(e) return False From b6c01db1fb004f8820f59d199f65c47aa0db4897 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Fri, 3 Mar 2017 15:46:37 -0500 Subject: [PATCH 22/36] Split mimetype methods - Instead of one large function that mutates FileBase properties, mimetype and main/subtype are determined by two separate methods that return mimetypes. - The API is not changed. - Absence of mimetype is now None instead of an empty string. --- kittengroomer/helpers.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index a2c17a2..9ba2b5e 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -27,7 +27,6 @@ class KittenGroomerError(Exception): class ImplementationRequired(KittenGroomerError): """Implementation required error.""" - pass @@ -46,7 +45,9 @@ class FileBase(object): self.log_details = {'filepath': self.src_path} self.log_string = '' self.extension = self._determine_extension() - self._determine_mimetype() + self.mimetype = self._determine_mimetype() + if self.mimetype: + self.main_type, self.sub_type = self._split_subtypes(self.mimetype) self.logger = logger self.filename = os.path.basename(self.src_path) @@ -57,24 +58,27 @@ class FileBase(object): def _determine_mimetype(self): if os.path.islink(self.src_path): # magic will throw an IOError on a broken symlink - self.mimetype = 'inode/symlink' + mimetype = 'inode/symlink' else: try: mt = magic.from_file(self.src_path, mime=True) # Note: magic will always return something, even if it's just 'data' except UnicodeEncodeError as e: # FIXME: The encoding of the file is broken (possibly UTF-16) - mt = '' + mt = None self.log_details.update({'UnicodeError': e}) try: - self.mimetype = mt.decode("utf-8") + mimetype = mt.decode("utf-8") except: - self.mimetype = mt - if self.mimetype and '/' in self.mimetype: - self.main_type, self.sub_type = self.mimetype.split('/') + mimetype = mt + return mimetype + + def _split_subtypes(self, mimetype): + if '/' in mimetype: + main_type, sub_type = mimetype.split('/') else: - self.main_type = '' - self.sub_type = '' + main_type, sub_type = None, None + return main_type, sub_type def has_mimetype(self): """ @@ -243,7 +247,6 @@ class KittenGroomerBase(object): self.cur_file = None self.logger = GroomerLogger(self.dst_root_dir, debug) - # ##### Helpers ##### def _safe_rmtree(self, directory): """Remove a directory tree if it exists.""" if os.path.exists(directory): From 1c58a7347e92b9a5378bed44ecf357eafb83b94f Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Mon, 6 Mar 2017 14:55:33 -0500 Subject: [PATCH 23/36] If no extentions FileBase.ext is now None --- kittengroomer/helpers.py | 5 ++++- tests/test_kittengroomer.py | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 9ba2b5e..3e270dc 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -53,7 +53,10 @@ class FileBase(object): def _determine_extension(self): _, ext = os.path.splitext(self.src_path) - return ext.lower() + ext = ext.lower() + if ext == '': + ext = None + return ext def _determine_mimetype(self): if os.path.islink(self.src_path): diff --git a/tests/test_kittengroomer.py b/tests/test_kittengroomer.py index 8339e84..bb5ab5d 100644 --- a/tests/test_kittengroomer.py +++ b/tests/test_kittengroomer.py @@ -130,7 +130,6 @@ class TestFileBase: def test_has_extension(self, temp_file, temp_file_no_ext): assert temp_file.has_extension() is True assert temp_file_no_ext.has_extension() is False - assert temp_file_no_ext.log_details.get('no_extension') is True def test_add_log_details(self, generic_conf_file): generic_conf_file.add_log_details('test', True) From 12d5624b4dcf37c94847582e923493b7cca6488e Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Mon, 6 Mar 2017 15:02:29 -0500 Subject: [PATCH 24/36] Change FileBase.log_details to Filebase._file_props * _file_props is a dict that will hold all information about the file * Updated filecheck.py to reflect this * Potentially will change contents of file_props to being attributes on the file in the future. This change would be easy since all access to _file_props is now via set_property and get_property methods. * Add filename to _file_props --- bin/filecheck.py | 75 +++++++++++++++---------------- kittengroomer/__init__.py | 2 +- kittengroomer/helpers.py | 88 +++++++++++++++++++++---------------- tests/test_kittengroomer.py | 67 ++++++++++++++-------------- 4 files changed, 121 insertions(+), 111 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index ccf0b0e..a49297d 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -86,9 +86,6 @@ class File(FileBase): def __init__(self, src_path, dst_path, logger): super(File, self).__init__(src_path, dst_path, logger) self.is_recursive = False - self.log_details.update({'maintype': self.main_type, - 'subtype': self.sub_type, - 'extension': self.extension}) subtypes_apps = [ (Config.mimes_office, self._winoffice), @@ -128,7 +125,7 @@ class File(FileBase): if not self.has_extension(): self.make_dangerous() if self.extension in Config.malicious_exts: - self.log_details.update({'malicious_extension': self.extension}) + self.set_property('malicious_extension', self.extension) self.make_dangerous() def _check_extension(self): @@ -146,7 +143,7 @@ class File(FileBase): expected_mimetype = Config.aliases[expected_mimetype] is_known_extension = self.extension in mimetypes.types_map.keys() if is_known_extension and expected_mimetype != self.mimetype: - self.log_details.update({'expected_mimetype': expected_mimetype}) + self.set_property('expected_mimetype', expected_mimetype) self.make_dangerous() def _check_mimetype(self): @@ -161,7 +158,7 @@ class File(FileBase): strict=False) if expected_extensions: if self.has_extension() and self.extension not in expected_extensions: - self.log_details.update({'expected_extensions': expected_extensions}) + self.set_property('expected_extensions', expected_extensions) self.make_dangerous() def check(self): @@ -182,11 +179,11 @@ class File(FileBase): def write_log(self): """Print the logs related to the current file being processed.""" - # LOG: move to helpers.py? - tmp_log = self.logger.log.fields(**self.log_details) + # LOG: move to helpers.py and change + tmp_log = self.logger.log.fields(**self._file_props) if self.is_dangerous(): tmp_log.warning(self.log_string) - elif self.log_details.get('unknown') or self.log_details.get('binary'): + elif self.get_property('unknown') or self.get_property('binary'): tmp_log.info(self.log_string) else: tmp_log.debug(self.log_string) @@ -209,7 +206,7 @@ class File(FileBase): def inode(self): """Empty file or symlink.""" if self.is_symlink(): - self.log_string += 'Symlink to {}'.format(self.log_details['symlink']) + self.log_string += 'Symlink to {}'.format(self.get_property('symlink')) else: self.log_string += 'Inode file' @@ -265,12 +262,12 @@ class File(FileBase): def _executables(self): """Processes an executable file.""" - self.add_log_details('processing_type', 'executable') + self.set_property('processing_type', 'executable') self.make_dangerous() def _winoffice(self): """Processes a winoffice file using olefile/oletools.""" - self.add_log_details('processing_type', 'WinOffice') + self.set_property('processing_type', 'WinOffice') # Try as if it is a valid document oid = oletools.oleid.OleID(self.src_path) if not olefile.isOleFile(self.src_path): @@ -278,37 +275,37 @@ class File(FileBase): try: ole = olefile.OleFileIO(self.src_path, raise_defects=olefile.DEFECT_INCORRECT) except: - self.add_log_details('not_parsable', True) + self.set_property('not_parsable', True) self.make_dangerous() if ole.parsing_issues: - self.add_log_details('parsing_issues', True) + self.set_property('parsing_issues', True) self.make_dangerous() else: if ole.exists('macros/vba') or ole.exists('Macros') \ or ole.exists('_VBA_PROJECT_CUR') or ole.exists('VBA'): - self.add_log_details('macro', True) + self.set_property('macro', True) self.make_dangerous() else: indicators = oid.check() # Encrypted ban be set by multiple checks on the script if oid.encrypted.value: - self.add_log_details('encrypted', True) + self.set_property('encrypted', True) self.make_dangerous() if oid.macros.value or oid.ole.exists('macros/vba') or oid.ole.exists('Macros') \ or oid.ole.exists('_VBA_PROJECT_CUR') or oid.ole.exists('VBA'): - self.add_log_details('macro', True) + self.set_property('macro', True) self.make_dangerous() for i in indicators: if i.id == 'ObjectPool' and i.value: # FIXME: Is it suspicious? - self.add_log_details('objpool', True) + self.set_property('objpool', True) elif i.id == 'flash' and i.value: - self.add_log_details('flash', True) + self.set_property('flash', True) self.make_dangerous() def _ooxml(self): """Processes an ooxml file.""" - self.add_log_details('processing_type', 'ooxml') + self.set_property('processing_type', 'ooxml') try: doc = officedissector.doc.Document(self.src_path) except Exception: @@ -318,56 +315,56 @@ class File(FileBase): # There are probably other potentially malicious features: # fonts, custom props, custom XML if doc.is_macro_enabled or len(doc.features.macros) > 0: - self.add_log_details('macro', True) + self.set_property('macro', True) self.make_dangerous() if len(doc.features.embedded_controls) > 0: - self.add_log_details('activex', True) + self.set_property('activex', True) self.make_dangerous() if len(doc.features.embedded_objects) > 0: # Exploited by CVE-2014-4114 (OLE) - self.add_log_details('embedded_obj', True) + self.set_property('embedded_obj', True) self.make_dangerous() if len(doc.features.embedded_packages) > 0: - self.add_log_details('embedded_pack', True) + self.set_property('embedded_pack', True) self.make_dangerous() def _libreoffice(self): """Processes a libreoffice file.""" - self.add_log_details('processing_type', 'libreoffice') + self.set_property('processing_type', 'libreoffice') # As long as there ar no way to do a sanity check on the files => dangerous try: lodoc = zipfile.ZipFile(self.src_path, 'r') except: # TODO: are there specific exceptions we should catch here? Or is anything ok - self.add_log_details('invalid', True) + self.set_property('invalid', True) self.make_dangerous() for f in lodoc.infolist(): fname = f.filename.lower() if fname.startswith('script') or fname.startswith('basic') or \ fname.startswith('object') or fname.endswith('.bin'): - self.add_log_details('macro', True) + self.set_property('macro', True) self.make_dangerous() def _pdf(self): """Processes a PDF file.""" - self.add_log_details('processing_type', 'pdf') + self.set_property('processing_type', 'pdf') xmlDoc = PDFiD(self.src_path) oPDFiD = cPDFiD(xmlDoc, True) # TODO: are there other characteristics which should be dangerous? if oPDFiD.encrypt.count > 0: - self.add_log_details('encrypted', True) + self.set_property('encrypted', True) self.make_dangerous() if oPDFiD.js.count > 0 or oPDFiD.javascript.count > 0: - self.add_log_details('javascript', True) + self.set_property('javascript', True) self.make_dangerous() if oPDFiD.aa.count > 0 or oPDFiD.openaction.count > 0: - self.add_log_details('openaction', True) + self.set_property('openaction', True) self.make_dangerous() if oPDFiD.richmedia.count > 0: - self.add_log_details('flash', True) + self.set_property('flash', True) self.make_dangerous() if oPDFiD.launch.count > 0: - self.add_log_details('launch', True) + self.set_property('launch', True) self.make_dangerous() def _archive(self): @@ -375,7 +372,7 @@ class File(FileBase): temporary directory and self.process_dir is called on that directory. The recursive archive depth is increased to protect against archive bombs.""" - self.add_log_details('processing_type', 'archive') + self.set_property('processing_type', 'archive') self.is_recursive = True # self.log_string += 'Archive extracted, processing content.' @@ -422,7 +419,7 @@ class File(FileBase): with open(metadata_file_path, 'w+') as metadata_file: metadata_file.write("Key: {}\tValue: {}\n".format(tag, printable)) - self.add_log_details('metadata', 'exif') + self.set_property('metadata', 'exif') img.close() return True @@ -435,7 +432,7 @@ class File(FileBase): if tag not in ('icc_profile'): with open(metadata_file_path, 'w+') as metadata_file: metadata_file.write("Key: {}\tValue: {}\n".format(tag, img.info[tag])) - self.add_log_details('metadata', 'png') + self.set_property('metadata', 'png') img.close() # Catch decompression bombs except Exception as e: @@ -467,7 +464,7 @@ class File(FileBase): def _media_processing(self): """Generic way to process all media files.""" - self.add_log_details('processing_type', 'media') + self.set_property('processing_type', 'media') def image(self): """Processes an image. @@ -493,7 +490,7 @@ class File(FileBase): print(e) self.make_dangerous() self.log_string += 'Image file' - self.add_log_details('processing_type', 'image') + self.set_property('processing_type', 'image') class KittenGroomerFileCheck(KittenGroomerBase): @@ -554,7 +551,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): def _handle_archivebomb(self, file): file.make_dangerous() - file.add_log_details('Archive Bomb', True) + file.set_property('Archive Bomb', True) self.logger.log.warning('ARCHIVE BOMB.') self.logger.log.warning('The content of the archive contains recursively other archives.') self.logger.log.warning('This is a bad sign so the archive is not extracted to the destination key.') diff --git a/kittengroomer/__init__.py b/kittengroomer/__init__.py index 262988a..8428553 100644 --- a/kittengroomer/__init__.py +++ b/kittengroomer/__init__.py @@ -1,4 +1,4 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from .helpers import FileBase, KittenGroomerBase, GroomerLog, main +from .helpers import FileBase, KittenGroomerBase, GroomerLogger, main diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 3e270dc..caaedff 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -41,15 +41,18 @@ class FileBase(object): """Initialized with the source path and expected destination path.""" self.src_path = src_path self.dst_path = dst_path - # TODO: rename this to file_properties (and change in other groomers) - self.log_details = {'filepath': self.src_path} + self.filename = os.path.basename(self.src_path) self.log_string = '' + self.logger = logger self.extension = self._determine_extension() self.mimetype = self._determine_mimetype() if self.mimetype: self.main_type, self.sub_type = self._split_subtypes(self.mimetype) - self.logger = logger - self.filename = os.path.basename(self.src_path) + self._file_props = {'filepath': self.src_path, + 'filename': self.filename, + 'maintype': self.main_type, + 'subtype': self.sub_type, + 'extension': self.extension} def _determine_extension(self): _, ext = os.path.splitext(self.src_path) @@ -68,8 +71,8 @@ class FileBase(object): # Note: magic will always return something, even if it's just 'data' except UnicodeEncodeError as e: # FIXME: The encoding of the file is broken (possibly UTF-16) + # LOG: log this error mt = None - self.log_details.update({'UnicodeError': e}) try: mimetype = mt.decode("utf-8") except: @@ -84,50 +87,50 @@ class FileBase(object): return main_type, sub_type def has_mimetype(self): - """ - Returns True if file has a full mimetype, else False. - - Returns False + updates log if self.main_type or self.sub_type - are not set. - """ + """Returns True if file has a full mimetype, else False.""" if not self.main_type or not self.sub_type: - self.log_details.update({'broken_mime': True}) + # LOG: log this as an error + # self._file_props.update({'broken_mime': True}) return False - return True + else: + return True def has_extension(self): - """ - Returns True if self.extension is set, else False. - - Returns False + updates self.log_details if self.extension is not set. - """ - if self.extension == '': - self.log_details.update({'no_extension': True}) + """Returns True if self.extension is set, else False.""" + if self.extension is None: + # TODO: move this props updating to some other method + # self.set_property('no_extension', True) return False - return True + else: + return True def is_dangerous(self): - """Returns True if self.log_details contains 'dangerous'.""" - return ('dangerous' in self.log_details) + """True if file has been marked 'dangerous' else False.""" + return ('dangerous' in self._file_props) def is_unknown(self): - """Returns True if self.log_details contains 'unknown'.""" - return ('unknown' in self.log_details) + """True if file has been marked 'unknown' else False.""" + return ('unknown' in self._file_props) def is_binary(self): - """returns True if self.log_details contains 'binary'.""" - return ('binary' in self.log_details) + """True if file has been marked 'binary' else False.""" + return ('binary' in self._file_props) def is_symlink(self): """Returns True and updates log if file is a symlink.""" if self.has_mimetype() and self.main_type == 'inode' and self.sub_type == 'symlink': - self.log_details.update({'symlink': os.readlink(self.src_path)}) + # TODO: move this props updating somewhere else + # self.set_property('symlink', os.readlink(self.src_path)) return True - return False + else: + return False - def add_log_details(self, key, value): - """Takes a key + a value and adds them to self.log_details.""" - self.log_details[key] = value + def set_property(self, prop_string, value): + """Takes a property + a value and adds them to self._file_props.""" + self._file_props[prop_string] = value + + def get_property(self, file_prop): + return self._file_props.get(file_prop, None) def make_dangerous(self): """ @@ -138,7 +141,7 @@ class FileBase(object): """ if self.is_dangerous(): return - self.log_details['dangerous'] = True + self.set_property('dangerous', True) path, filename = os.path.split(self.dst_path) self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename)) @@ -146,7 +149,7 @@ class FileBase(object): """Marks a file as an unknown type and prepends UNKNOWN to filename.""" if self.is_dangerous() or self.is_binary(): return - self.log_details['unknown'] = True + self.set_property('unknown', True) path, filename = os.path.split(self.dst_path) self.dst_path = os.path.join(path, 'UNKNOWN_{}'.format(filename)) @@ -154,15 +157,17 @@ class FileBase(object): """Marks a file as a binary and appends .bin to filename.""" if self.is_dangerous(): return - self.log_details['binary'] = True + self.set_property('binary', True) path, filename = os.path.split(self.dst_path) self.dst_path = os.path.join(path, '{}.bin'.format(filename)) def force_ext(self, ext): - """If dst_path does not end in ext, appends the ext and updates log.""" + """If dst_path does not end in ext, changes it and edits _file_props.""" if not self.dst_path.endswith(ext): - self.log_details['force_ext'] = True + self.set_property('force_ext', True) self.dst_path += ext + if not self._file_props['extension'] == ext: + self.set_property('extension', ext) def create_metadata_file(self, ext): """Create a separate file to hold this file's metadata.""" @@ -187,8 +192,9 @@ class FileBase(object): class GroomerLogger(object): """Groomer logging interface""" - def __init__(self, root_dir, debug=False): - self.log_dir_path = os.path.join(root_dir, 'logs') + def __init__(self, root_dir_path, debug=False): + self.root_dir = root_dir_path + self.log_dir_path = os.path.join(root_dir_path, 'logs') if os.path.exists(self.log_dir_path): shutil.rmtree(self.log_dir_path) os.makedirs(self.log_dir_path) @@ -252,21 +258,25 @@ class KittenGroomerBase(object): def _safe_rmtree(self, directory): """Remove a directory tree if it exists.""" + # TODO: should be a public method if os.path.exists(directory): shutil.rmtree(directory) def _safe_remove(self, filepath): """Remove a file if it exists.""" + # TODO: should be a public method if os.path.exists(filepath): os.remove(filepath) def _safe_mkdir(self, directory): """Make a directory if it does not exist.""" + # TODO: should be a public method if not os.path.exists(directory): os.makedirs(directory) def _safe_copy(self, src=None, dst=None): """Copy a file and create directory if needed.""" + # TODO: should be a public method if src is None: src = self.cur_file.src_path if dst is None: diff --git a/tests/test_kittengroomer.py b/tests/test_kittengroomer.py index bb5ab5d..4497728 100644 --- a/tests/test_kittengroomer.py +++ b/tests/test_kittengroomer.py @@ -5,7 +5,7 @@ import os import pytest -from kittengroomer import FileBase, KittenGroomerBase, GroomerLog +from kittengroomer import FileBase, KittenGroomerBase, GroomerLogger from kittengroomer.helpers import ImplementationRequired skip = pytest.mark.skip @@ -95,13 +95,10 @@ class TestFileBase: def test_init(self, generic_conf_file): file = generic_conf_file - assert file.log_details - assert file.log_details['filepath'] == file.src_path - assert file.extension == '.conf' - copied_log = file.log_details.copy() - file.log_details = '' - # assert file.log_details == copied_log # this fails for now, we need to make log_details undeletable - # we should probably check for more extensions here + assert file.get_property('filepath') == file.src_path + assert file.get_property('extension') == '.conf' + # TODO: should we make log_details undeletable? + # TODO: we should probably check for more extensions here def test_extension_uppercase(self, tmpdir): file_path = tmpdir.join('TEST.TXT') @@ -129,13 +126,13 @@ class TestFileBase: def test_has_extension(self, temp_file, temp_file_no_ext): assert temp_file.has_extension() is True + print(temp_file_no_ext.extension) assert temp_file_no_ext.has_extension() is False - def test_add_log_details(self, generic_conf_file): - generic_conf_file.add_log_details('test', True) - assert generic_conf_file.log_details['test'] is True - with pytest.raises(KeyError): - assert generic_conf_file.log_details['wrong'] is False + def test_set_property(self, generic_conf_file): + generic_conf_file.set_property('test', True) + assert generic_conf_file.get_property('test') is True + assert generic_conf_file.get_property('wrong') is None def test_marked_dangerous(self, file_marked_all_parameterized): file_marked_all_parameterized.make_dangerous() @@ -164,52 +161,54 @@ class TestFileBase: assert symlink.is_symlink() is True def test_generic_make_unknown(self, generic_conf_file): - assert generic_conf_file.log_details.get('unknown') is None + assert generic_conf_file.is_unknown() is False generic_conf_file.make_unknown() - assert generic_conf_file.log_details.get('unknown') is True + assert generic_conf_file.is_unknown() # given a FileBase object with no marking, should do the right things def test_marked_make_unknown(self, file_marked_all_parameterized): file = file_marked_all_parameterized - if file.log_details.get('unknown'): + if file.is_unknown(): file.make_unknown() - assert file.log_details.get('unknown') is True + assert file.is_unknown() else: - assert file.log_details.get('unknown') is None + assert file.is_unknown() is False file.make_unknown() - assert file.log_details.get('unknown') is None + assert file.is_unknown() is False # given a FileBase object with an unrecognized marking, should ??? def test_generic_make_binary(self, generic_conf_file): - assert generic_conf_file.log_details.get('binary') is None + assert generic_conf_file.is_binary() is False generic_conf_file.make_binary() - assert generic_conf_file.log_details.get('binary') is True + assert generic_conf_file.is_binary() is True def test_marked_make_binary(self, file_marked_all_parameterized): file = file_marked_all_parameterized - if file.log_details.get('dangerous'): + if file.is_dangerous(): file.make_binary() - assert file.log_details.get('binary') is None + assert file.is_binary() is False + assert file.get_property('binary') is None else: file.make_binary() - assert file.log_details.get('binary') is True + assert file.is_binary() + assert file.get_property('binary') is True def test_force_ext_change(self, generic_conf_file): assert generic_conf_file.has_extension() - assert generic_conf_file.extension == '.conf' + assert generic_conf_file.get_property('extension') == '.conf' assert os.path.splitext(generic_conf_file.dst_path)[1] == '.conf' generic_conf_file.force_ext('.txt') assert os.path.splitext(generic_conf_file.dst_path)[1] == '.txt' - assert generic_conf_file.log_details.get('force_ext') is True - # should make a file's extension change + assert generic_conf_file.get_property('force_ext') is True + assert generic_conf_file.get_property('extension') == '.txt' # should be able to handle weird paths def test_force_ext_correct(self, generic_conf_file): assert generic_conf_file.has_extension() - assert generic_conf_file.extension == '.conf' + assert generic_conf_file.get_property('extension') == '.conf' generic_conf_file.force_ext('.conf') assert os.path.splitext(generic_conf_file.dst_path)[1] == '.conf' - assert generic_conf_file.log_details.get('force_ext') is None + assert generic_conf_file.get_property('force_ext') is None # shouldn't change a file's extension if it already is right def test_create_metadata_file(self, temp_file): @@ -224,9 +223,13 @@ class TestFileBase: class TestLogger: - @xfail - def test_tree(self, tmpdir): - GroomerLog.tree(tmpdir) + + @fixture + def generic_logger(self, tmpdir): + return GroomerLogger(tmpdir.strpath) + + def test_tree(self, generic_logger): + generic_logger.tree(generic_logger.root_dir) class TestKittenGroomerBase: From fc8923fddd10430c9706edc897ed7585c2b4db2c Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Tue, 7 Mar 2017 15:31:32 -0500 Subject: [PATCH 25/36] Change Groomer private methods to public * Changed safe_rmtree, safe_copy, safe_remove, and safe_mkdir to public methods * If something is being used in a subclass it probably shouldn't be a private method --- bin/filecheck.py | 6 +++--- kittengroomer/helpers.py | 14 +++++--------- tests/test_kittengroomer.py | 2 +- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index a49297d..a0c4f6f 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -523,10 +523,10 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.process_archive(file) else: # TODO: Make a File attribute for should be copied True/False and check it - self._safe_copy() + self.safe_copy() file.write_log() if hasattr(file, "tempdir_path"): - self._safe_rmtree(file.tempdir_path) + self.safe_rmtree(file.tempdir_path) def process_archive(self, file): """Unpacks an archive using 7zip and processes contents. @@ -546,7 +546,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): # LOG: check that tree is working correctly here self.logger.tree(tempdir_path) self.process_dir(tempdir_path, file.dst_path) - self._safe_rmtree(tempdir_path) + self.safe_rmtree(tempdir_path) self.recursive_archive_depth -= 1 def _handle_archivebomb(self, file): diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index caaedff..cf9e098 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -256,34 +256,30 @@ class KittenGroomerBase(object): self.cur_file = None self.logger = GroomerLogger(self.dst_root_dir, debug) - def _safe_rmtree(self, directory): + def safe_rmtree(self, directory): """Remove a directory tree if it exists.""" - # TODO: should be a public method if os.path.exists(directory): shutil.rmtree(directory) - def _safe_remove(self, filepath): + def safe_remove(self, filepath): """Remove a file if it exists.""" - # TODO: should be a public method if os.path.exists(filepath): os.remove(filepath) - def _safe_mkdir(self, directory): + def safe_mkdir(self, directory): """Make a directory if it does not exist.""" - # TODO: should be a public method if not os.path.exists(directory): os.makedirs(directory) - def _safe_copy(self, src=None, dst=None): + def safe_copy(self, src=None, dst=None): """Copy a file and create directory if needed.""" - # TODO: should be a public method if src is None: src = self.cur_file.src_path if dst is None: dst = self.cur_file.dst_path try: dst_path, filename = os.path.split(dst) - self._safe_mkdir(dst_path) + self.safe_mkdir(dst_path) shutil.copy(src, dst) return True except Exception as e: diff --git a/tests/test_kittengroomer.py b/tests/test_kittengroomer.py index 4497728..5866aaf 100644 --- a/tests/test_kittengroomer.py +++ b/tests/test_kittengroomer.py @@ -264,7 +264,7 @@ class TestKittenGroomerBase: filedest = testdir.join('test.txt') simple_groomer = KittenGroomerBase(tmpdir.strpath, testdir.strpath) simple_groomer.cur_file = FileBase(file.strpath, filedest.strpath) - assert simple_groomer._safe_copy() is True + assert simple_groomer.safe_copy() is True #check that it handles weird file path inputs def test_list_all_files(self, tmpdir): From 0038d3ef66a3ea9634848e86de38a52c36d274e1 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Wed, 8 Mar 2017 21:30:06 -0500 Subject: [PATCH 26/36] Switch to using file properties * make_dangerous now takes a description string * add_file_string takes strings describing the file --- bin/filecheck.py | 143 +++++++++++++++--------------------- kittengroomer/helpers.py | 81 +++++++++++++------- tests/test_filecheck.py | 3 + tests/test_kittengroomer.py | 40 ++++------ 4 files changed, 129 insertions(+), 138 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index a0c4f6f..47f9c02 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -120,13 +120,12 @@ class File(FileBase): } def _check_dangerous(self): - if not self.has_mimetype(): # No mimetype, should not happen. - self.make_dangerous() + if not self.has_mimetype(): + self.make_dangerous('no mimetype') if not self.has_extension(): - self.make_dangerous() + self.make_dangerous('no extension') if self.extension in Config.malicious_exts: - self.set_property('malicious_extension', self.extension) - self.make_dangerous() + self.make_dangerous('malicious_extension') def _check_extension(self): """Guesses the file's mimetype based on its extension. If the file's @@ -143,8 +142,8 @@ class File(FileBase): expected_mimetype = Config.aliases[expected_mimetype] is_known_extension = self.extension in mimetypes.types_map.keys() if is_known_extension and expected_mimetype != self.mimetype: - self.set_property('expected_mimetype', expected_mimetype) - self.make_dangerous() + # LOG: improve this string + self.make_dangerous('expected_mimetyped') def _check_mimetype(self): """Takes the mimetype (as determined by libmagic) and determines @@ -158,8 +157,8 @@ class File(FileBase): strict=False) if expected_extensions: if self.has_extension() and self.extension not in expected_extensions: - self.set_property('expected_extensions', expected_extensions) - self.make_dangerous() + # LOG: improve this string + self.make_dangerous('expected extensions') def check(self): self._check_dangerous() @@ -179,11 +178,11 @@ class File(FileBase): def write_log(self): """Print the logs related to the current file being processed.""" - # LOG: move to helpers.py and change + # LOG: move to helpers.py GroomerLogger and modify tmp_log = self.logger.log.fields(**self._file_props) if self.is_dangerous(): tmp_log.warning(self.log_string) - elif self.get_property('unknown') or self.get_property('binary'): + elif self.is_unknown() or self.is_binary(): tmp_log.info(self.log_string) else: tmp_log.debug(self.log_string) @@ -206,67 +205,71 @@ class File(FileBase): def inode(self): """Empty file or symlink.""" if self.is_symlink(): - self.log_string += 'Symlink to {}'.format(self.get_property('symlink')) + symlink_path = self.get_property('symlink') + self.add_file_string('Symlink to {}'.format(symlink_path)) else: - self.log_string += 'Inode file' + self.add_file_string('Inode file') def unknown(self): """Main type should never be unknown.""" - self.log_string += 'Unknown file' + self.add_file_string('Unknown file') def example(self): """Used in examples, should never be returned by libmagic.""" - self.log_string += 'Example file' + self.add_file_string('Example file') def multipart(self): """Used in web apps, should never be returned by libmagic""" - self.log_string += 'Multipart file' + self.add_file_string('Multipart file') # ##### Treated as malicious, no reason to have it on a USB key ###### def message(self): """Process a message file.""" - self.log_string += 'Message file' - self.make_dangerous() + self.add_file_string('Message file') + self.make_dangerous('Message file') def model(self): """Process a model file.""" - self.log_string += 'Model file' - self.make_dangerous() + self.add_file_string('Model file') + self.make_dangerous('Model file') # ##### Files that will be converted ###### def text(self): """Process an rtf, ooxml, or plaintext file.""" for mt in Config.mimes_rtf: if mt in self.sub_type: - self.log_string += 'Rich Text file' + self.add_file_string('Rich Text file') # TODO: need a way to convert it to plain text self.force_ext('.txt') return for mt in Config.mimes_ooxml: if mt in self.sub_type: - self.log_string += 'OOXML File' + self.add_file_string('OOXML File') self._ooxml() return - self.log_string += 'Text file' + self.add_file_string('Text file') self.force_ext('.txt') def application(self): """Processes an application specific file according to its subtype.""" for subtype, method in self.app_subtype_methods.items(): if subtype in self.sub_type: + # TODO: should this return a value? method() - self.log_string += 'Application file' + self.add_file_string('Application file') return - self.log_string += 'Unknown Application file' + self.add_file_string('Unknown Application file') self._unknown_app() def _executables(self): """Processes an executable file.""" + # LOG: change this property self.set_property('processing_type', 'executable') - self.make_dangerous() + self.make_dangerous('executable') def _winoffice(self): """Processes a winoffice file using olefile/oletools.""" + # LOG: change this property self.set_property('processing_type', 'WinOffice') # Try as if it is a valid document oid = oletools.oleid.OleID(self.src_path) @@ -275,33 +278,27 @@ class File(FileBase): try: ole = olefile.OleFileIO(self.src_path, raise_defects=olefile.DEFECT_INCORRECT) except: - self.set_property('not_parsable', True) - self.make_dangerous() + self.make_dangerous('not parsable') if ole.parsing_issues: - self.set_property('parsing_issues', True) - self.make_dangerous() + self.make_dangerous('parsing issues') else: if ole.exists('macros/vba') or ole.exists('Macros') \ or ole.exists('_VBA_PROJECT_CUR') or ole.exists('VBA'): - self.set_property('macro', True) - self.make_dangerous() + self.make_dangerous('macro') else: indicators = oid.check() - # Encrypted ban be set by multiple checks on the script + # Encrypted can be set by multiple checks on the script if oid.encrypted.value: - self.set_property('encrypted', True) - self.make_dangerous() + self.make_dangerous('encrypted') if oid.macros.value or oid.ole.exists('macros/vba') or oid.ole.exists('Macros') \ or oid.ole.exists('_VBA_PROJECT_CUR') or oid.ole.exists('VBA'): - self.set_property('macro', True) - self.make_dangerous() + self.make_dangerous('macro') for i in indicators: if i.id == 'ObjectPool' and i.value: - # FIXME: Is it suspicious? + # TODO: Is it suspicious? self.set_property('objpool', True) elif i.id == 'flash' and i.value: - self.set_property('flash', True) - self.make_dangerous() + self.make_dangerous('flash') def _ooxml(self): """Processes an ooxml file.""" @@ -309,24 +306,19 @@ class File(FileBase): try: doc = officedissector.doc.Document(self.src_path) except Exception: - # Invalid file - self.make_dangerous() + self.make_dangerous('invalid ooxml file') return # There are probably other potentially malicious features: # fonts, custom props, custom XML if doc.is_macro_enabled or len(doc.features.macros) > 0: - self.set_property('macro', True) - self.make_dangerous() + self.make_dangerous('macro') if len(doc.features.embedded_controls) > 0: - self.set_property('activex', True) - self.make_dangerous() + self.make_dangerous('activex') if len(doc.features.embedded_objects) > 0: # Exploited by CVE-2014-4114 (OLE) - self.set_property('embedded_obj', True) - self.make_dangerous() + self.make_dangerous('embedded obj') if len(doc.features.embedded_packages) > 0: - self.set_property('embedded_pack', True) - self.make_dangerous() + self.make_dangerous('embedded pack') def _libreoffice(self): """Processes a libreoffice file.""" @@ -336,14 +328,12 @@ class File(FileBase): lodoc = zipfile.ZipFile(self.src_path, 'r') except: # TODO: are there specific exceptions we should catch here? Or is anything ok - self.set_property('invalid', True) - self.make_dangerous() + self.make_dangerous('invalid libreoffice file') for f in lodoc.infolist(): fname = f.filename.lower() if fname.startswith('script') or fname.startswith('basic') or \ fname.startswith('object') or fname.endswith('.bin'): - self.set_property('macro', True) - self.make_dangerous() + self.make_dangerous('macro') def _pdf(self): """Processes a PDF file.""" @@ -352,20 +342,15 @@ class File(FileBase): oPDFiD = cPDFiD(xmlDoc, True) # TODO: are there other characteristics which should be dangerous? if oPDFiD.encrypt.count > 0: - self.set_property('encrypted', True) - self.make_dangerous() + self.make_dangerous('encrypted pdf') if oPDFiD.js.count > 0 or oPDFiD.javascript.count > 0: - self.set_property('javascript', True) - self.make_dangerous() + self.make_dangerous('pdf with javascript') if oPDFiD.aa.count > 0 or oPDFiD.openaction.count > 0: - self.set_property('openaction', True) - self.make_dangerous() + self.make_dangerous('openaction') if oPDFiD.richmedia.count > 0: - self.set_property('flash', True) - self.make_dangerous() + self.make_dangerous('flash') if oPDFiD.launch.count > 0: - self.set_property('launch', True) - self.make_dangerous() + self.make_dangerous('launch') def _archive(self): """Processes an archive using 7zip. The archive is extracted to a @@ -394,16 +379,12 @@ class File(FileBase): try: tags = exifread.process_file(img, debug=True) except Exception as e: - # LOG: log instead of print - print("Error while trying to grab full metadata for file {}; retrying for partial data.".format(self.src_path)) - print(e) + self.add_error(e, "Error while trying to grab full metadata for file {}; retrying for partial data.".format(self.src_path)) if tags is None: try: tags = exifread.process_file(img, debug=True) except Exception as e: - # LOG: log instead of print - print("Failed to get any metadata for file {}.".format(self.src_path)) - print(e) + self.add_error(e, "Failed to get any metadata for file {}.".format(self.src_path)) img.close() return False @@ -436,10 +417,8 @@ class File(FileBase): img.close() # Catch decompression bombs except Exception as e: - # LOG: log instead of print - print("Caught exception processing metadata for {}".format(self.src_path)) - print(e) - self.make_dangerous() + self.add_error(e, "Caught exception processing metadata for {}".format(self.src_path)) + self.make_dangerous('exception processing metadata') return False def extract_metadata(self): @@ -485,11 +464,9 @@ class File(FileBase): self.src_path = tempfile_path except Exception as e: # Catch decompression bombs # TODO: change this from all Exceptions to specific DecompressionBombWarning - # LOG: change this from printing to logging - print("Caught exception (possible decompression bomb?) while translating file {}.".format(self.src_path)) - print(e) + self.add_error(e, "Caught exception (possible decompression bomb?) while translating file {}.".format(self.src_path)) self.make_dangerous() - self.log_string += 'Image file' + self.add_file_string('Image file') self.set_property('processing_type', 'image') @@ -521,11 +498,10 @@ class KittenGroomerFileCheck(KittenGroomerBase): file.check() if file.is_recursive: self.process_archive(file) - else: - # TODO: Make a File attribute for should be copied True/False and check it - self.safe_copy() + elif file.should_copy: + self.safe_copy(file.src_path, file.dst_path) file.write_log() - if hasattr(file, "tempdir_path"): + if hasattr(file, 'tempdir_path'): self.safe_rmtree(file.tempdir_path) def process_archive(self, file): @@ -550,8 +526,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.recursive_archive_depth -= 1 def _handle_archivebomb(self, file): - file.make_dangerous() - file.set_property('Archive Bomb', True) + file.make_dangerous('Archive bomb') self.logger.log.warning('ARCHIVE BOMB.') self.logger.log.warning('The content of the archive contains recursively other archives.') self.logger.log.warning('This is a bad sign so the archive is not extracted to the destination key.') diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index cf9e098..cef4c8d 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -44,15 +44,26 @@ class FileBase(object): self.filename = os.path.basename(self.src_path) self.log_string = '' self.logger = logger + self.symlink = None self.extension = self._determine_extension() self.mimetype = self._determine_mimetype() + self.should_copy = True if self.mimetype: self.main_type, self.sub_type = self._split_subtypes(self.mimetype) - self._file_props = {'filepath': self.src_path, - 'filename': self.filename, - 'maintype': self.main_type, - 'subtype': self.sub_type, - 'extension': self.extension} + self._file_props = { + 'filepath': self.src_path, + 'filename': self.filename, + 'maintype': self.main_type, + 'subtype': self.sub_type, + 'extension': self.extension, + 'file_size': self.size, + 'safety_category': None, + 'symlink': self.symlink, + 'copied': False, + 'file_string_set': set(), + 'errors': {}, + 'user_defined': {} + } def _determine_extension(self): _, ext = os.path.splitext(self.src_path) @@ -65,13 +76,14 @@ class FileBase(object): if os.path.islink(self.src_path): # magic will throw an IOError on a broken symlink mimetype = 'inode/symlink' + self.symlink = os.readlink(self.src_path) else: try: mt = magic.from_file(self.src_path, mime=True) # Note: magic will always return something, even if it's just 'data' except UnicodeEncodeError as e: # FIXME: The encoding of the file is broken (possibly UTF-16) - # LOG: log this error + self.add_error(e, '') mt = None try: mimetype = mt.decode("utf-8") @@ -86,10 +98,14 @@ class FileBase(object): main_type, sub_type = None, None return main_type, sub_type + @property + def size(self): + return os.path.getsize(self.src_path) + def has_mimetype(self): """Returns True if file has a full mimetype, else False.""" if not self.main_type or not self.sub_type: - # LOG: log this as an error + # LOG: log this as an error, move somewhere else? # self._file_props.update({'broken_mime': True}) return False else: @@ -98,7 +114,7 @@ class FileBase(object): def has_extension(self): """Returns True if self.extension is set, else False.""" if self.extension is None: - # TODO: move this props updating to some other method + # TODO: do we actually want to have a seperate prop for no extension? # self.set_property('no_extension', True) return False else: @@ -106,33 +122,46 @@ class FileBase(object): def is_dangerous(self): """True if file has been marked 'dangerous' else False.""" - return ('dangerous' in self._file_props) + return self._file_props['safety_category'] is 'dangerous' def is_unknown(self): """True if file has been marked 'unknown' else False.""" - return ('unknown' in self._file_props) + return self._file_props['safety_category'] is 'unknown' def is_binary(self): """True if file has been marked 'binary' else False.""" - return ('binary' in self._file_props) + return self._file_props['safety_category'] is 'binary' def is_symlink(self): """Returns True and updates log if file is a symlink.""" - if self.has_mimetype() and self.main_type == 'inode' and self.sub_type == 'symlink': - # TODO: move this props updating somewhere else - # self.set_property('symlink', os.readlink(self.src_path)) - return True - else: + if self._file_props['symlink'] is None: return False + else: + return True def set_property(self, prop_string, value): """Takes a property + a value and adds them to self._file_props.""" - self._file_props[prop_string] = value + if prop_string in self._file_props.keys(): + self._file_props[prop_string] = value + else: + self._file_props['user_defined'][prop_string] = value def get_property(self, file_prop): - return self._file_props.get(file_prop, None) + # FIXME: needs to be refactored + if file_prop in self._file_props: + return self._file_props[file_prop] + elif file_prop in self._file_props['user_defined']: + return self._file_props['user_defined'][file_prop] + else: + return None - def make_dangerous(self): + def add_error(self, error, info): + self._file_props['errors'].update({error: info}) + + def add_file_string(self, file_string): + self._file_props['file_string_set'].add(file_string) + + def make_dangerous(self, reason_string=None): """ Marks a file as dangerous. @@ -141,7 +170,8 @@ class FileBase(object): """ if self.is_dangerous(): return - self.set_property('dangerous', True) + self.set_property('safety_category', 'dangerous') + # LOG: store reason string path, filename = os.path.split(self.dst_path) self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename)) @@ -149,7 +179,7 @@ class FileBase(object): """Marks a file as an unknown type and prepends UNKNOWN to filename.""" if self.is_dangerous() or self.is_binary(): return - self.set_property('unknown', True) + self.set_property('safety_category', 'unknown') path, filename = os.path.split(self.dst_path) self.dst_path = os.path.join(path, 'UNKNOWN_{}'.format(filename)) @@ -157,7 +187,7 @@ class FileBase(object): """Marks a file as a binary and appends .bin to filename.""" if self.is_dangerous(): return - self.set_property('binary', True) + self.set_property('safety_category', 'binary') path, filename = os.path.split(self.dst_path) self.dst_path = os.path.join(path, '{}.bin'.format(filename)) @@ -185,7 +215,7 @@ class FileBase(object): self.metadata_file_path = self.dst_path + ext return self.metadata_file_path except KittenGroomerError as e: - # LOG: Write to log file + self.add_error(e, '') return False @@ -281,11 +311,8 @@ class KittenGroomerBase(object): dst_path, filename = os.path.split(dst) self.safe_mkdir(dst_path) shutil.copy(src, dst) - return True except Exception as e: - # LOG: Logfile - print(e) - return False + self.add_error(e, '') def list_all_files(self, directory): """Generator yielding path to all of the files in a directory tree.""" diff --git a/tests/test_filecheck.py b/tests/test_filecheck.py index b573a64..f187523 100644 --- a/tests/test_filecheck.py +++ b/tests/test_filecheck.py @@ -43,6 +43,9 @@ class TestIntegration: test_description = "filecheck_valid" save_logs(groomer, test_description) + def test_processdir(self): + pass + class TestFileHandling: def test_autorun(self): diff --git a/tests/test_kittengroomer.py b/tests/test_kittengroomer.py index 5866aaf..41f9a40 100644 --- a/tests/test_kittengroomer.py +++ b/tests/test_kittengroomer.py @@ -30,7 +30,7 @@ class TestFileBase: return FileBase(source_file, dest_file) @fixture - def symlink(self, tmpdir): + def symlink_file(self, tmpdir): file_path = tmpdir.join('test.txt') file_path.write('testing') file_path = file_path.strpath @@ -65,7 +65,7 @@ class TestFileBase: @fixture def file_marked_binary(self, generic_conf_file): - generic_conf_file.mark_binary() + generic_conf_file.make_binary() return generic_conf_file @fixture(params=[ @@ -81,24 +81,17 @@ class TestFileBase: # What should FileBase do if it's given a path that isn't a file (doesn't exist or is a dir)? Currently magic throws an exception # We should probably catch everytime that happens and tell the user explicitly happened (and maybe put it in the log) - def test_create(self): - file = FileBase('tests/src_valid/blah.conf', '/tests/dst/blah.conf') - def test_create_broken(self, tmpdir): with pytest.raises(TypeError): - file_no_args = FileBase() + FileBase() with pytest.raises(FileNotFoundError): - file_empty_args = FileBase('', '') + FileBase('', '') with pytest.raises(IsADirectoryError): - file_directory = FileBase(tmpdir.strpath, tmpdir.strpath) - # are there other cases here? path to a file that doesn't exist? permissions? + FileBase(tmpdir.strpath, tmpdir.strpath) + # TODO: are there other cases here? path to a file that doesn't exist? permissions? def test_init(self, generic_conf_file): - file = generic_conf_file - assert file.get_property('filepath') == file.src_path - assert file.get_property('extension') == '.conf' - # TODO: should we make log_details undeletable? - # TODO: we should probably check for more extensions here + generic_conf_file def test_extension_uppercase(self, tmpdir): file_path = tmpdir.join('TEST.TXT') @@ -108,10 +101,10 @@ class TestFileBase: assert file.extension == '.txt' def test_mimetypes(self, generic_conf_file): - assert generic_conf_file.has_mimetype() assert generic_conf_file.mimetype == 'text/plain' assert generic_conf_file.main_type == 'text' assert generic_conf_file.sub_type == 'plain' + assert generic_conf_file.has_mimetype() # Need to test something without a mimetype # Need to test something that's a directory # Need to test something that causes the unicode exception @@ -151,14 +144,14 @@ class TestFileBase: file_path = file_path.strpath symlink_path = tmpdir.join('symlinked.txt') symlink_path = symlink_path.strpath - file_symlink = os.symlink(file_path, symlink_path) + os.symlink(file_path, symlink_path) file = FileBase(file_path, file_path) symlink = FileBase(symlink_path, symlink_path) assert file.is_symlink() is False assert symlink.is_symlink() is True - def test_has_symlink_fixture(self, symlink): - assert symlink.is_symlink() is True + def test_has_symlink_fixture(self, symlink_file): + assert symlink_file.is_symlink() is True def test_generic_make_unknown(self, generic_conf_file): assert generic_conf_file.is_unknown() is False @@ -187,11 +180,9 @@ class TestFileBase: if file.is_dangerous(): file.make_binary() assert file.is_binary() is False - assert file.get_property('binary') is None else: file.make_binary() assert file.is_binary() - assert file.get_property('binary') is True def test_force_ext_change(self, generic_conf_file): assert generic_conf_file.has_extension() @@ -254,7 +245,6 @@ class TestKittenGroomerBase: debug_groomer = KittenGroomerBase(source_directory, dest_directory, debug=True) - # we should maybe protect access to self.current_file in some way? def test_safe_copy(self, tmpdir): file = tmpdir.join('test.txt') @@ -264,8 +254,8 @@ class TestKittenGroomerBase: filedest = testdir.join('test.txt') simple_groomer = KittenGroomerBase(tmpdir.strpath, testdir.strpath) simple_groomer.cur_file = FileBase(file.strpath, filedest.strpath) - assert simple_groomer.safe_copy() is True - #check that it handles weird file path inputs + simple_groomer.safe_copy() + # check that safe copy can handle weird file path inputs def test_list_all_files(self, tmpdir): file = tmpdir.join('test.txt') @@ -276,7 +266,3 @@ class TestKittenGroomerBase: files = simple_groomer.list_all_files(simple_groomer.src_root_dir) assert file.strpath in files assert testdir.strpath not in files - - def test_processdir(self, generic_groomer): - with pytest.raises(ImplementationRequired): - generic_groomer.processdir(None, None) From 3fe8c7c223ac921dd596dbbd91b00425eca248fd Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Wed, 8 Mar 2017 23:06:20 -0500 Subject: [PATCH 27/36] Adjust order of property initialization Tests were failing due to values being set before file_props dict was created --- bin/filecheck.py | 6 ------ kittengroomer/helpers.py | 28 +++++++++++++++------------- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 47f9c02..396d8b3 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -180,12 +180,6 @@ class File(FileBase): """Print the logs related to the current file being processed.""" # LOG: move to helpers.py GroomerLogger and modify tmp_log = self.logger.log.fields(**self._file_props) - if self.is_dangerous(): - tmp_log.warning(self.log_string) - elif self.is_unknown() or self.is_binary(): - tmp_log.info(self.log_string) - else: - tmp_log.debug(self.log_string) @property def has_metadata(self): diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index cef4c8d..aca99f8 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -42,28 +42,29 @@ class FileBase(object): self.src_path = src_path self.dst_path = dst_path self.filename = os.path.basename(self.src_path) - self.log_string = '' self.logger = logger - self.symlink = None - self.extension = self._determine_extension() - self.mimetype = self._determine_mimetype() - self.should_copy = True - if self.mimetype: - self.main_type, self.sub_type = self._split_subtypes(self.mimetype) self._file_props = { 'filepath': self.src_path, 'filename': self.filename, - 'maintype': self.main_type, - 'subtype': self.sub_type, - 'extension': self.extension, 'file_size': self.size, + 'maintype': None, + 'subtype': None, + 'extension': None, 'safety_category': None, - 'symlink': self.symlink, + 'symlink': False, 'copied': False, 'file_string_set': set(), 'errors': {}, 'user_defined': {} } + self.extension = self._determine_extension() + self.set_property('extension', self.extension) + self.mimetype = self._determine_mimetype() + self.should_copy = True + if self.mimetype: + self.main_type, self.sub_type = self._split_subtypes(self.mimetype) + self.set_property('maintype', self.main_type) + self.set_property('subtype', self.sub_type) def _determine_extension(self): _, ext = os.path.splitext(self.src_path) @@ -76,13 +77,14 @@ class FileBase(object): if os.path.islink(self.src_path): # magic will throw an IOError on a broken symlink mimetype = 'inode/symlink' - self.symlink = os.readlink(self.src_path) + self.set_property('symlink', os.readlink(self.src_path)) else: try: mt = magic.from_file(self.src_path, mime=True) # Note: magic will always return something, even if it's just 'data' except UnicodeEncodeError as e: # FIXME: The encoding of the file is broken (possibly UTF-16) + # One of the Travis files will trigger this. self.add_error(e, '') mt = None try: @@ -134,7 +136,7 @@ class FileBase(object): def is_symlink(self): """Returns True and updates log if file is a symlink.""" - if self._file_props['symlink'] is None: + if self._file_props['symlink'] is False: return False else: return True From 18857da7caa31a173906f984fc2963905eb67869 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Wed, 8 Mar 2017 23:22:53 -0500 Subject: [PATCH 28/36] Several small bugfixes * Fix issue with main/subtypes in init * Fix bug in File.check() in filecheck.py * Fix FileBase.size for symlinks --- bin/filecheck.py | 6 ++++-- kittengroomer/helpers.py | 14 +++++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 396d8b3..144aed9 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -162,8 +162,10 @@ class File(FileBase): def check(self): self._check_dangerous() - self._check_extension() - self._check_mimetype() + if self.has_extension(): + self._check_extension() + if self.has_mimetype(): + self._check_mimetype() if not self.is_dangerous(): self.mime_processing_options.get(self.main_type, self.unknown)() diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index aca99f8..cf6309f 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -61,10 +61,14 @@ class FileBase(object): self.set_property('extension', self.extension) self.mimetype = self._determine_mimetype() self.should_copy = True + self.main_type = None + self.sub_type = None if self.mimetype: self.main_type, self.sub_type = self._split_subtypes(self.mimetype) - self.set_property('maintype', self.main_type) - self.set_property('subtype', self.sub_type) + if self.main_type: + self.set_property('maintype', self.main_type) + if self.sub_type: + self.set_property('subtype', self.sub_type) def _determine_extension(self): _, ext = os.path.splitext(self.src_path) @@ -102,7 +106,11 @@ class FileBase(object): @property def size(self): - return os.path.getsize(self.src_path) + try: + size = os.path.getsize(self.src_path) + except FileNotFoundError: + size = 0 + return size def has_mimetype(self): """Returns True if file has a full mimetype, else False.""" From 484c71fc860a1a435d070144a594acf03fb242e8 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Thu, 9 Mar 2017 01:41:42 -0500 Subject: [PATCH 29/36] Turn off copying for certain mimes in filecheck --- bin/filecheck.py | 4 ++++ kittengroomer/helpers.py | 1 + 2 files changed, 5 insertions(+) diff --git a/bin/filecheck.py b/bin/filecheck.py index 144aed9..171b5ff 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -205,18 +205,22 @@ class File(FileBase): self.add_file_string('Symlink to {}'.format(symlink_path)) else: self.add_file_string('Inode file') + self.should_copy = False def unknown(self): """Main type should never be unknown.""" self.add_file_string('Unknown file') + self.should_copy = False def example(self): """Used in examples, should never be returned by libmagic.""" self.add_file_string('Example file') + self.should_copy = False def multipart(self): """Used in web apps, should never be returned by libmagic""" self.add_file_string('Multipart file') + self.should_copy = False # ##### Treated as malicious, no reason to have it on a USB key ###### def message(self): diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index cf6309f..a358813 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -321,6 +321,7 @@ class KittenGroomerBase(object): dst_path, filename = os.path.split(dst) self.safe_mkdir(dst_path) shutil.copy(src, dst) + self.set_property('copied', True) except Exception as e: self.add_error(e, '') From e73721e95fc6e7f4f261685cdb559e77547f1510 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Thu, 9 Mar 2017 01:53:44 -0500 Subject: [PATCH 30/36] Fix bug with safe_copy --- bin/filecheck.py | 1 + kittengroomer/helpers.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 171b5ff..acc160d 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -500,6 +500,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.process_archive(file) elif file.should_copy: self.safe_copy(file.src_path, file.dst_path) + file.set_property('copied', True) file.write_log() if hasattr(file, 'tempdir_path'): self.safe_rmtree(file.tempdir_path) diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index a358813..cf6309f 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -321,7 +321,6 @@ class KittenGroomerBase(object): dst_path, filename = os.path.split(dst) self.safe_mkdir(dst_path) shutil.copy(src, dst) - self.set_property('copied', True) except Exception as e: self.add_error(e, '') From 59cde8cfd5de1b5b097fd6c30a06e64d752cfbc0 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Thu, 9 Mar 2017 13:48:07 -0500 Subject: [PATCH 31/36] Move safe_copy to FileBase --- bin/filecheck.py | 2 +- kittengroomer/helpers.py | 30 ++++++++++++++---------------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index acc160d..157c83f 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -499,7 +499,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): if file.is_recursive: self.process_archive(file) elif file.should_copy: - self.safe_copy(file.src_path, file.dst_path) + file.safe_copy() file.set_property('copied', True) file.write_log() if hasattr(file, 'tempdir_path'): diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index cf6309f..f422be5 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -201,6 +201,20 @@ class FileBase(object): path, filename = os.path.split(self.dst_path) self.dst_path = os.path.join(path, '{}.bin'.format(filename)) + def safe_copy(self, src=None, dst=None): + """Copy a file and create directory if needed.""" + if src is None: + src = self.src_path + if dst is None: + dst = self.dst_path + try: + dst_path, filename = os.path.split(dst) + if not os.path.exists(dst_path): + os.makedirs(dst_path) + shutil.copy(src, dst) + except Exception as e: + self.add_error(e, '') + def force_ext(self, ext): """If dst_path does not end in ext, changes it and edits _file_props.""" if not self.dst_path.endswith(ext): @@ -281,9 +295,6 @@ class GroomerLogger(object): # return a sublog for the file pass - def add_event(self, event, log_level): - pass - class KittenGroomerBase(object): """Base object responsible for copy/sanitization process.""" @@ -311,19 +322,6 @@ class KittenGroomerBase(object): if not os.path.exists(directory): os.makedirs(directory) - def safe_copy(self, src=None, dst=None): - """Copy a file and create directory if needed.""" - if src is None: - src = self.cur_file.src_path - if dst is None: - dst = self.cur_file.dst_path - try: - dst_path, filename = os.path.split(dst) - self.safe_mkdir(dst_path) - shutil.copy(src, dst) - except Exception as e: - self.add_error(e, '') - def list_all_files(self, directory): """Generator yielding path to all of the files in a directory tree.""" for root, dirs, files in os.walk(directory): From 963a2feef4643b0d8334a7a0d9cc17d86e360d7a Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Fri, 10 Mar 2017 13:13:38 -0500 Subject: [PATCH 32/36] Change various methods to properties --- bin/filecheck.py | 35 +++++++++------------ kittengroomer/helpers.py | 17 +++++++--- tests/test_kittengroomer.py | 63 +++++++++++++++++-------------------- 3 files changed, 56 insertions(+), 59 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 157c83f..8af9c2c 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -120,9 +120,9 @@ class File(FileBase): } def _check_dangerous(self): - if not self.has_mimetype(): + if not self.has_mimetype: self.make_dangerous('no mimetype') - if not self.has_extension(): + if not self.has_extension: self.make_dangerous('no extension') if self.extension in Config.malicious_exts: self.make_dangerous('malicious_extension') @@ -156,17 +156,17 @@ class File(FileBase): expected_extensions = mimetypes.guess_all_extensions(mimetype, strict=False) if expected_extensions: - if self.has_extension() and self.extension not in expected_extensions: + if self.has_extension and self.extension not in expected_extensions: # LOG: improve this string self.make_dangerous('expected extensions') def check(self): self._check_dangerous() - if self.has_extension(): + if self.has_extension: self._check_extension() - if self.has_mimetype(): + if self.has_mimetype: self._check_mimetype() - if not self.is_dangerous(): + if not self.is_dangerous: self.mime_processing_options.get(self.main_type, self.unknown)() # ##### Helper functions ##### @@ -178,11 +178,6 @@ class File(FileBase): dict_to_return[subtype] = method return dict_to_return - def write_log(self): - """Print the logs related to the current file being processed.""" - # LOG: move to helpers.py GroomerLogger and modify - tmp_log = self.logger.log.fields(**self._file_props) - @property def has_metadata(self): if self.mimetype in Config.mimes_metadata: @@ -200,7 +195,7 @@ class File(FileBase): # ##### Discarded mimetypes, reason in the docstring ###### def inode(self): """Empty file or symlink.""" - if self.is_symlink(): + if self.is_symlink: symlink_path = self.get_property('symlink') self.add_file_string('Symlink to {}'.format(symlink_path)) else: @@ -479,19 +474,19 @@ class KittenGroomerFileCheck(KittenGroomerBase): def process_dir(self, src_dir, dst_dir): """Main function coordinating file processing.""" - # LOG: we probably want to move this write_log elsewhere: + # LOG: what's the purpose of this write log?: # if self.recursive_archive_depth > 0: # self.write_log() - # TODO: Can we clean up the way we handle relative_path? + # We're writing the log here because... + # How exactly does the workflow work with an archive? for srcpath in self.list_all_files(src_dir): dstpath = srcpath.replace(src_dir, dst_dir) - relative_path = srcpath.replace(src_dir + '/', '') + # TODO: Can we clean up the way we handle relative_path? + # Relative path is here so that when we print files in the log it + # shows only the file's path. Should we just pass it to the logger + # when we create it? Or let the logger figure it out? + # relative_path = srcpath.replace(src_dir + '/', '') self.cur_file = File(srcpath, dstpath, self.logger) - # LOG: move this logging code elsewhere - self.logger.log.info('Processing {} ({}/{})', - relative_path, - self.cur_file.main_type, - self.cur_file.sub_type) self.process_file(self.cur_file) def process_file(self, file): diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index f422be5..9f80b72 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -112,6 +112,7 @@ class FileBase(object): size = 0 return size + @property def has_mimetype(self): """Returns True if file has a full mimetype, else False.""" if not self.main_type or not self.sub_type: @@ -121,6 +122,7 @@ class FileBase(object): else: return True + @property def has_extension(self): """Returns True if self.extension is set, else False.""" if self.extension is None: @@ -130,18 +132,22 @@ class FileBase(object): else: return True + @property def is_dangerous(self): """True if file has been marked 'dangerous' else False.""" return self._file_props['safety_category'] is 'dangerous' + @property def is_unknown(self): """True if file has been marked 'unknown' else False.""" return self._file_props['safety_category'] is 'unknown' + @property def is_binary(self): """True if file has been marked 'binary' else False.""" return self._file_props['safety_category'] is 'binary' + @property def is_symlink(self): """Returns True and updates log if file is a symlink.""" if self._file_props['symlink'] is False: @@ -178,7 +184,7 @@ class FileBase(object): Prepends and appends DANGEROUS to the destination file name to help prevent double-click of death. """ - if self.is_dangerous(): + if self.is_dangerous: return self.set_property('safety_category', 'dangerous') # LOG: store reason string @@ -187,7 +193,7 @@ class FileBase(object): def make_unknown(self): """Marks a file as an unknown type and prepends UNKNOWN to filename.""" - if self.is_dangerous() or self.is_binary(): + if self.is_dangerous or self.is_binary: return self.set_property('safety_category', 'unknown') path, filename = os.path.split(self.dst_path) @@ -195,7 +201,7 @@ class FileBase(object): def make_binary(self): """Marks a file as a binary and appends .bin to filename.""" - if self.is_dangerous(): + if self.is_dangerous: return self.set_property('safety_category', 'binary') path, filename = os.path.split(self.dst_path) @@ -242,6 +248,10 @@ class FileBase(object): self.add_error(e, '') return False + def write_log(self): + """Print the logs related to the current file being processed.""" + tmp_log = self.logger.log.fields(**self._file_props) + class GroomerLogger(object): """Groomer logging interface""" @@ -291,7 +301,6 @@ class GroomerLogger(object): return s.hexdigest() def add_file(self, file): - # File object will add itself? # return a sublog for the file pass diff --git a/tests/test_kittengroomer.py b/tests/test_kittengroomer.py index 41f9a40..940136d 100644 --- a/tests/test_kittengroomer.py +++ b/tests/test_kittengroomer.py @@ -104,23 +104,23 @@ class TestFileBase: assert generic_conf_file.mimetype == 'text/plain' assert generic_conf_file.main_type == 'text' assert generic_conf_file.sub_type == 'plain' - assert generic_conf_file.has_mimetype() + assert generic_conf_file.has_mimetype # Need to test something without a mimetype # Need to test something that's a directory # Need to test something that causes the unicode exception def test_has_mimetype_no_main_type(self, generic_conf_file): generic_conf_file.main_type = '' - assert generic_conf_file.has_mimetype() is False + assert generic_conf_file.has_mimetype is False def test_has_mimetype_no_sub_type(self, generic_conf_file): generic_conf_file.sub_type = '' - assert generic_conf_file.has_mimetype() is False + assert generic_conf_file.has_mimetype is False def test_has_extension(self, temp_file, temp_file_no_ext): - assert temp_file.has_extension() is True + assert temp_file.has_extension is True print(temp_file_no_ext.extension) - assert temp_file_no_ext.has_extension() is False + assert temp_file_no_ext.has_extension is False def test_set_property(self, generic_conf_file): generic_conf_file.set_property('test', True) @@ -129,14 +129,14 @@ class TestFileBase: def test_marked_dangerous(self, file_marked_all_parameterized): file_marked_all_parameterized.make_dangerous() - assert file_marked_all_parameterized.is_dangerous() is True + assert file_marked_all_parameterized.is_dangerous is True # Should work regardless of weird paths?? # Should check file path alteration behavior as well def test_generic_dangerous(self, generic_conf_file): - assert generic_conf_file.is_dangerous() is False + assert generic_conf_file.is_dangerous is False generic_conf_file.make_dangerous() - assert generic_conf_file.is_dangerous() is True + assert generic_conf_file.is_dangerous is True def test_has_symlink(self, tmpdir): file_path = tmpdir.join('test.txt') @@ -147,45 +147,45 @@ class TestFileBase: os.symlink(file_path, symlink_path) file = FileBase(file_path, file_path) symlink = FileBase(symlink_path, symlink_path) - assert file.is_symlink() is False - assert symlink.is_symlink() is True + assert file.is_symlink is False + assert symlink.is_symlink is True def test_has_symlink_fixture(self, symlink_file): - assert symlink_file.is_symlink() is True + assert symlink_file.is_symlink is True def test_generic_make_unknown(self, generic_conf_file): - assert generic_conf_file.is_unknown() is False + assert generic_conf_file.is_unknown is False generic_conf_file.make_unknown() - assert generic_conf_file.is_unknown() + assert generic_conf_file.is_unknown # given a FileBase object with no marking, should do the right things def test_marked_make_unknown(self, file_marked_all_parameterized): file = file_marked_all_parameterized - if file.is_unknown(): + if file.is_unknown: file.make_unknown() - assert file.is_unknown() + assert file.is_unknown else: - assert file.is_unknown() is False + assert file.is_unknown is False file.make_unknown() - assert file.is_unknown() is False + assert file.is_unknown is False # given a FileBase object with an unrecognized marking, should ??? def test_generic_make_binary(self, generic_conf_file): - assert generic_conf_file.is_binary() is False + assert generic_conf_file.is_binary is False generic_conf_file.make_binary() - assert generic_conf_file.is_binary() is True + assert generic_conf_file.is_binary def test_marked_make_binary(self, file_marked_all_parameterized): file = file_marked_all_parameterized - if file.is_dangerous(): + if file.is_dangerous: file.make_binary() - assert file.is_binary() is False + assert file.is_binary is False else: file.make_binary() - assert file.is_binary() + assert file.is_binary def test_force_ext_change(self, generic_conf_file): - assert generic_conf_file.has_extension() + assert generic_conf_file.has_extension assert generic_conf_file.get_property('extension') == '.conf' assert os.path.splitext(generic_conf_file.dst_path)[1] == '.conf' generic_conf_file.force_ext('.txt') @@ -195,7 +195,7 @@ class TestFileBase: # should be able to handle weird paths def test_force_ext_correct(self, generic_conf_file): - assert generic_conf_file.has_extension() + assert generic_conf_file.has_extension assert generic_conf_file.get_property('extension') == '.conf' generic_conf_file.force_ext('.conf') assert os.path.splitext(generic_conf_file.dst_path)[1] == '.conf' @@ -212,6 +212,10 @@ class TestFileBase: # if metadata file already exists # if there is no metadata to write should this work? + def test_safe_copy(self, generic_conf_file): + generic_conf_file.safe_copy() + # check that safe copy can handle weird file path inputs + class TestLogger: @@ -246,17 +250,6 @@ class TestKittenGroomerBase: dest_directory, debug=True) - def test_safe_copy(self, tmpdir): - file = tmpdir.join('test.txt') - file.write('testing') - testdir = tmpdir.join('testdir') - os.mkdir(testdir.strpath) - filedest = testdir.join('test.txt') - simple_groomer = KittenGroomerBase(tmpdir.strpath, testdir.strpath) - simple_groomer.cur_file = FileBase(file.strpath, filedest.strpath) - simple_groomer.safe_copy() - # check that safe copy can handle weird file path inputs - def test_list_all_files(self, tmpdir): file = tmpdir.join('test.txt') file.write('testing') From 0175ee48e594a77fd0bd115171b9c13c494d8119 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Tue, 14 Mar 2017 10:41:31 -0400 Subject: [PATCH 33/36] Add TODOs and clarify various logging messages --- bin/filecheck.py | 57 +++++++++++++++++----------------------- kittengroomer/helpers.py | 20 +++++++------- 2 files changed, 33 insertions(+), 44 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 8af9c2c..91b494e 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -143,7 +143,7 @@ class File(FileBase): is_known_extension = self.extension in mimetypes.types_map.keys() if is_known_extension and expected_mimetype != self.mimetype: # LOG: improve this string - self.make_dangerous('expected_mimetyped') + self.make_dangerous('expected_mimetype') def _check_mimetype(self): """Takes the mimetype (as determined by libmagic) and determines @@ -249,7 +249,8 @@ class File(FileBase): """Processes an application specific file according to its subtype.""" for subtype, method in self.app_subtype_methods.items(): if subtype in self.sub_type: - # TODO: should this return a value? + # TODO: should we change the logic so we don't iterate through all of the subtype methods? + # TODO: should these methods return a value? method() self.add_file_string('Application file') return @@ -258,16 +259,15 @@ class File(FileBase): def _executables(self): """Processes an executable file.""" - # LOG: change this property + # LOG: change the processing_type property to some other name or include in file_string self.set_property('processing_type', 'executable') self.make_dangerous('executable') def _winoffice(self): """Processes a winoffice file using olefile/oletools.""" - # LOG: change this property + # LOG: processing_type property self.set_property('processing_type', 'WinOffice') - # Try as if it is a valid document - oid = oletools.oleid.OleID(self.src_path) + oid = oletools.oleid.OleID(self.src_path) # First assume a valid file if not olefile.isOleFile(self.src_path): # Manual processing, may already count as suspicious try: @@ -291,12 +291,14 @@ class File(FileBase): for i in indicators: if i.id == 'ObjectPool' and i.value: # TODO: Is it suspicious? + # LOG: user defined property self.set_property('objpool', True) elif i.id == 'flash' and i.value: self.make_dangerous('flash') def _ooxml(self): """Processes an ooxml file.""" + # LOG: processing_type property self.set_property('processing_type', 'ooxml') try: doc = officedissector.doc.Document(self.src_path) @@ -318,7 +320,7 @@ class File(FileBase): def _libreoffice(self): """Processes a libreoffice file.""" self.set_property('processing_type', 'libreoffice') - # As long as there ar no way to do a sanity check on the files => dangerous + # As long as there is no way to do a sanity check on the files => dangerous try: lodoc = zipfile.ZipFile(self.src_path, 'r') except: @@ -332,6 +334,7 @@ class File(FileBase): def _pdf(self): """Processes a PDF file.""" + # LOG: processing_type property self.set_property('processing_type', 'pdf') xmlDoc = PDFiD(self.src_path) oPDFiD = cPDFiD(xmlDoc, True) @@ -352,9 +355,10 @@ class File(FileBase): temporary directory and self.process_dir is called on that directory. The recursive archive depth is increased to protect against archive bombs.""" + # LOG: change this to something archive specific self.set_property('processing_type', 'archive') + self.should_copy = False self.is_recursive = True - # self.log_string += 'Archive extracted, processing content.' def _unknown_app(self): """Processes an unknown file.""" @@ -367,10 +371,9 @@ class File(FileBase): ####################### # Metadata extractors def _metadata_exif(self, metadata_file_path): - # TODO: this method is kind of long, can we shorten it? + # TODO: this method is kind of long, can we shorten it somehow? img = open(self.src_path, 'rb') tags = None - try: tags = exifread.process_file(img, debug=True) except Exception as e: @@ -382,19 +385,17 @@ class File(FileBase): self.add_error(e, "Failed to get any metadata for file {}.".format(self.src_path)) img.close() return False - for tag in sorted(tags.keys()): - # These are long and obnoxious/binary + # These tags are long and obnoxious/binary so we don't add them if tag not in ('JPEGThumbnail', 'TIFFThumbnail'): - printable = str(tags[tag]) - + tag_string = str(tags[tag]) # Exifreader truncates data. - if len(printable) > 25 and printable.endswith(", ... ]"): - value = tags[tag].values - printable = str(value) - + if len(tag_string) > 25 and tag_string.endswith(", ... ]"): + tag_value = tags[tag].values + tag_string = str(tag_value) with open(metadata_file_path, 'w+') as metadata_file: - metadata_file.write("Key: {}\tValue: {}\n".format(tag, printable)) + metadata_file.write("Key: {}\tValue: {}\n".format(tag, tag_string)) + # LOG: how do we want to log metadata? self.set_property('metadata', 'exif') img.close() return True @@ -408,10 +409,12 @@ class File(FileBase): if tag not in ('icc_profile'): with open(metadata_file_path, 'w+') as metadata_file: metadata_file.write("Key: {}\tValue: {}\n".format(tag, img.info[tag])) + # LOG: handle metadata self.set_property('metadata', 'png') img.close() # Catch decompression bombs except Exception as e: + # TODO: only catch DecompressionBombWarnings here? self.add_error(e, "Caught exception processing metadata for {}".format(self.src_path)) self.make_dangerous('exception processing metadata') return False @@ -474,11 +477,6 @@ class KittenGroomerFileCheck(KittenGroomerBase): def process_dir(self, src_dir, dst_dir): """Main function coordinating file processing.""" - # LOG: what's the purpose of this write log?: - # if self.recursive_archive_depth > 0: - # self.write_log() - # We're writing the log here because... - # How exactly does the workflow work with an archive? for srcpath in self.list_all_files(src_dir): dstpath = srcpath.replace(src_dir, dst_dir) # TODO: Can we clean up the way we handle relative_path? @@ -506,9 +504,9 @@ class KittenGroomerFileCheck(KittenGroomerBase): Should be given a Kittengroomer file object whose src_path points to an archive.""" self.recursive_archive_depth += 1 - # Check for archivebomb + # LOG: write_log or somehow log the archive file here if self.recursive_archive_depth >= self.max_recursive_depth: - self._handle_archivebomb(file) + file.make_dangerous('Archive bomb') else: tempdir_path = file.make_tempdir() command_str = '{} -p1 x "{}" -o"{}" -bd -aoa' @@ -521,13 +519,6 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.safe_rmtree(tempdir_path) self.recursive_archive_depth -= 1 - def _handle_archivebomb(self, file): - file.make_dangerous('Archive bomb') - self.logger.log.warning('ARCHIVE BOMB.') - self.logger.log.warning('The content of the archive contains recursively other archives.') - self.logger.log.warning('This is a bad sign so the archive is not extracted to the destination key.') - # TODO: delete whatever we want to delete that's already been copied to dest dir - def _run_process(self, command_string, timeout=None): """Run command_string in a subprocess, wait until it finishes.""" args = shlex.split(command_string) diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 9f80b72..465ad4a 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -88,7 +88,7 @@ class FileBase(object): # Note: magic will always return something, even if it's just 'data' except UnicodeEncodeError as e: # FIXME: The encoding of the file is broken (possibly UTF-16) - # One of the Travis files will trigger this. + # Note: one of the Travis files will trigger this exception self.add_error(e, '') mt = None try: @@ -115,9 +115,9 @@ class FileBase(object): @property def has_mimetype(self): """Returns True if file has a full mimetype, else False.""" + # TODO: broken mimetype checks should be done somewhere else. + # Should the check be by default or should we let the API consumer write it? if not self.main_type or not self.sub_type: - # LOG: log this as an error, move somewhere else? - # self._file_props.update({'broken_mime': True}) return False else: return True @@ -126,8 +126,6 @@ class FileBase(object): def has_extension(self): """Returns True if self.extension is set, else False.""" if self.extension is None: - # TODO: do we actually want to have a seperate prop for no extension? - # self.set_property('no_extension', True) return False else: return True @@ -163,7 +161,7 @@ class FileBase(object): self._file_props['user_defined'][prop_string] = value def get_property(self, file_prop): - # FIXME: needs to be refactored + # TODO: could probably be refactored if file_prop in self._file_props: return self._file_props[file_prop] elif file_prop in self._file_props['user_defined']: @@ -187,7 +185,7 @@ class FileBase(object): if self.is_dangerous: return self.set_property('safety_category', 'dangerous') - # LOG: store reason string + # LOG: store reason string somewhere and do something with it path, filename = os.path.split(self.dst_path) self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename)) @@ -250,11 +248,12 @@ class FileBase(object): def write_log(self): """Print the logs related to the current file being processed.""" - tmp_log = self.logger.log.fields(**self._file_props) + file_log = self.logger.add_file(self) + file_log.fields(**self._file_props) class GroomerLogger(object): - """Groomer logging interface""" + """Groomer logging interface.""" def __init__(self, root_dir_path, debug=False): self.root_dir = root_dir_path @@ -301,8 +300,7 @@ class GroomerLogger(object): return s.hexdigest() def add_file(self, file): - # return a sublog for the file - pass + return self.log.name('file.src_path') class KittenGroomerBase(object): From ac94cf5d6d40673e43cde8ff716f402a400d4321 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Tue, 14 Mar 2017 19:53:23 -0400 Subject: [PATCH 34/36] Change the way test dst dirs are handled * Each test folder now copies files into its own test directory * Change gitignore due to dst dir changes * Make sure logger.tree is called for every directory --- .gitignore | 4 ++++ bin/filecheck.py | 5 +++-- tests/dst/.keepdir | 0 tests/test_filecheck.py | 25 +++++++++++++++++++------ 4 files changed, 26 insertions(+), 8 deletions(-) delete mode 100644 tests/dst/.keepdir diff --git a/.gitignore b/.gitignore index ecf6be3..3f07002 100644 --- a/.gitignore +++ b/.gitignore @@ -68,7 +68,11 @@ target/ # Project specific tests/dst/* +tests/*_dst tests/test_logs/* !tests/**/.keepdir !tests/src_invalid/* !tests/src_valid/* +pdfid.py +# Plugins are pdfid stuff +plugin_* diff --git a/bin/filecheck.py b/bin/filecheck.py index 91b494e..6513442 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -477,6 +477,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): def process_dir(self, src_dir, dst_dir): """Main function coordinating file processing.""" + self.logger.tree(src_dir) for srcpath in self.list_all_files(src_dir): dstpath = srcpath.replace(src_dir, dst_dir) # TODO: Can we clean up the way we handle relative_path? @@ -509,12 +510,12 @@ class KittenGroomerFileCheck(KittenGroomerBase): file.make_dangerous('Archive bomb') else: tempdir_path = file.make_tempdir() + # TODO: double check we are properly escaping file.src_path + # otherwise we are running unvalidated user input directly in the shell command_str = '{} -p1 x "{}" -o"{}" -bd -aoa' unpack_command = command_str.format(SEVENZ_PATH, file.src_path, tempdir_path) self._run_process(unpack_command) - # LOG: check that tree is working correctly here - self.logger.tree(tempdir_path) self.process_dir(tempdir_path, file.dst_path) self.safe_rmtree(tempdir_path) self.recursive_archive_depth -= 1 diff --git a/tests/dst/.keepdir b/tests/dst/.keepdir deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_filecheck.py b/tests/test_filecheck.py index f187523..f58152d 100644 --- a/tests/test_filecheck.py +++ b/tests/test_filecheck.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import os +import shutil import pytest @@ -20,25 +21,27 @@ skipif_nodeps = pytest.mark.skipif(NODEPS, class TestIntegration: @pytest.fixture - def src_valid(self): + def src_valid_path(self): return os.path.join(os.getcwd(), 'tests/src_valid') @pytest.fixture - def src_invalid(self): + def src_invalid_path(self): return os.path.join(os.getcwd(), 'tests/src_invalid') @pytest.fixture def dst(self): return os.path.join(os.getcwd(), 'tests/dst') - def test_filecheck(self, src_invalid, dst): - groomer = KittenGroomerFileCheck(src_invalid, dst, debug=True) + def test_filecheck_src_invalid(self, src_invalid_path): + dst_path = self.make_dst_dir_path(src_invalid_path) + groomer = KittenGroomerFileCheck(src_invalid_path, dst_path, debug=True) groomer.run() test_description = "filecheck_invalid" save_logs(groomer, test_description) - def test_filecheck_2(self, src_valid, dst): - groomer = KittenGroomerFileCheck(src_valid, dst, debug=True) + def test_filecheck_2(self, src_valid_path): + dst_path = self.make_dst_dir_path(src_valid_path) + groomer = KittenGroomerFileCheck(src_valid_path, dst_path, debug=True) groomer.run() test_description = "filecheck_valid" save_logs(groomer, test_description) @@ -46,8 +49,18 @@ class TestIntegration: def test_processdir(self): pass + def test_handle_archives(self): + pass + + def make_dst_dir_path(self, src_dir_path): + dst_path = src_dir_path + '_dst' + shutil.rmtree(dst_path, ignore_errors=True) + os.makedirs(dst_path, exist_ok=True) + return dst_path + class TestFileHandling: def test_autorun(self): # Run on a single autorun file, confirm that it gets flagged as dangerous + # TODO: build out these and other methods for individual file cases pass From 4d8a1d1dafd7ef453e5a8653215c7ab214edcad6 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Wed, 15 Mar 2017 22:29:51 -0400 Subject: [PATCH 35/36] Add/update docstrings for filecheck and helpers --- bin/filecheck.py | 64 ++++++++++++++++++++++++++-------------- kittengroomer/helpers.py | 62 +++++++++++++++++++++++--------------- 2 files changed, 80 insertions(+), 46 deletions(-) diff --git a/bin/filecheck.py b/bin/filecheck.py index 6513442..3dc26dd 100644 --- a/bin/filecheck.py +++ b/bin/filecheck.py @@ -23,6 +23,8 @@ SEVENZ_PATH = '/usr/bin/7z' class Config: + """Configuration information for Filecheck.""" + # Application subtypes (mimetype: 'application/') mimes_ooxml = ['vnd.openxmlformats-officedocument.'] mimes_office = ['msword', 'vnd.ms-'] @@ -180,12 +182,13 @@ class File(FileBase): @property def has_metadata(self): + """True if filetype typically contains metadata, else False.""" if self.mimetype in Config.mimes_metadata: return True return False def make_tempdir(self): - """Make a temporary directory.""" + """Make a temporary directory at self.tempdir_path.""" self.tempdir_path = self.dst_path + '_temp' if not os.path.exists(self.tempdir_path): os.makedirs(self.tempdir_path) @@ -246,7 +249,7 @@ class File(FileBase): self.force_ext('.txt') def application(self): - """Processes an application specific file according to its subtype.""" + """Process an application specific file according to its subtype.""" for subtype, method in self.app_subtype_methods.items(): if subtype in self.sub_type: # TODO: should we change the logic so we don't iterate through all of the subtype methods? @@ -258,13 +261,13 @@ class File(FileBase): self._unknown_app() def _executables(self): - """Processes an executable file.""" + """Process an executable file.""" # LOG: change the processing_type property to some other name or include in file_string self.set_property('processing_type', 'executable') self.make_dangerous('executable') def _winoffice(self): - """Processes a winoffice file using olefile/oletools.""" + """Process a winoffice file using olefile/oletools.""" # LOG: processing_type property self.set_property('processing_type', 'WinOffice') oid = oletools.oleid.OleID(self.src_path) # First assume a valid file @@ -297,7 +300,7 @@ class File(FileBase): self.make_dangerous('flash') def _ooxml(self): - """Processes an ooxml file.""" + """Process an ooxml file.""" # LOG: processing_type property self.set_property('processing_type', 'ooxml') try: @@ -318,7 +321,7 @@ class File(FileBase): self.make_dangerous('embedded pack') def _libreoffice(self): - """Processes a libreoffice file.""" + """Process a libreoffice file.""" self.set_property('processing_type', 'libreoffice') # As long as there is no way to do a sanity check on the files => dangerous try: @@ -333,7 +336,7 @@ class File(FileBase): self.make_dangerous('macro') def _pdf(self): - """Processes a PDF file.""" + """Process a PDF file.""" # LOG: processing_type property self.set_property('processing_type', 'pdf') xmlDoc = PDFiD(self.src_path) @@ -351,26 +354,30 @@ class File(FileBase): self.make_dangerous('launch') def _archive(self): - """Processes an archive using 7zip. The archive is extracted to a - temporary directory and self.process_dir is called on that directory. - The recursive archive depth is increased to protect against archive - bombs.""" + """ + Process an archive using 7zip. + + The archive is extracted to a temporary directory and self.process_dir + is called on that directory. The recursive archive depth is increased + to protect against archive bombs. + """ # LOG: change this to something archive specific self.set_property('processing_type', 'archive') self.should_copy = False self.is_recursive = True def _unknown_app(self): - """Processes an unknown file.""" + """Process an unknown file.""" self.make_unknown() def _binary_app(self): - """Processses an unknown binary file.""" + """Process an unknown binary file.""" self.make_binary() ####################### # Metadata extractors def _metadata_exif(self, metadata_file_path): + """Read exif metadata from a jpg or tiff file using exifread.""" # TODO: this method is kind of long, can we shorten it somehow? img = open(self.src_path, 'rb') tags = None @@ -401,6 +408,7 @@ class File(FileBase): return True def _metadata_png(self, metadata_file_path): + """Extract metadata from a png file using PIL/Pillow.""" warnings.simplefilter('error', Image.DecompressionBombWarning) try: img = Image.open(self.src_path) @@ -420,6 +428,7 @@ class File(FileBase): return False def extract_metadata(self): + """Create metadata file and call correct metadata extraction method.""" metadata_file_path = self.create_metadata_file(".metadata.txt") mt = self.mimetype metadata_processing_method = self.metadata_mimetype_methods.get(mt) @@ -430,12 +439,12 @@ class File(FileBase): ####################### # ##### Media - audio and video aren't converted ###### def audio(self): - """Processes an audio file.""" + """Process an audio file.""" self.log_string += 'Audio file' self._media_processing() def video(self): - """Processes a video.""" + """Process a video.""" self.log_string += 'Video file' self._media_processing() @@ -444,11 +453,14 @@ class File(FileBase): self.set_property('processing_type', 'media') def image(self): - """Processes an image. + """ + Process an image. - Extracts metadata to dest key if metadata is present. Creates a - temporary directory on dest key, opens the using PIL.Image,saves it to - the temporary directory, and copies it to the destination.""" + Extracts metadata to dest key using self.extract_metada() if metadata + is present. Creates a temporary directory on dest key, opens the image + using PIL.Image, saves it to the temporary directory, and copies it to + the destination. + """ # TODO: make sure this method works for png, gif, tiff if self.has_metadata: self.extract_metadata() @@ -476,7 +488,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.max_recursive_depth = max_recursive_depth def process_dir(self, src_dir, dst_dir): - """Main function coordinating file processing.""" + """Process a directory on the source key.""" self.logger.tree(src_dir) for srcpath in self.list_all_files(src_dir): dstpath = srcpath.replace(src_dir, dst_dir) @@ -489,6 +501,12 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.process_file(self.cur_file) def process_file(self, file): + """ + Process an individual file. + + Check the file, handle archives using self.process_archive, copy + the file to the destionation key, and clean up temporary directory. + """ file.check() if file.is_recursive: self.process_archive(file) @@ -500,10 +518,12 @@ class KittenGroomerFileCheck(KittenGroomerBase): self.safe_rmtree(file.tempdir_path) def process_archive(self, file): - """Unpacks an archive using 7zip and processes contents. + """ + Unpack an archive using 7zip and process contents using process_dir. Should be given a Kittengroomer file object whose src_path points - to an archive.""" + to an archive. + """ self.recursive_archive_depth += 1 # LOG: write_log or somehow log the archive file here if self.recursive_archive_depth >= self.max_recursive_depth: diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 465ad4a..318f1ca 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -32,13 +32,18 @@ class ImplementationRequired(KittenGroomerError): class FileBase(object): """ - Base object for individual files in the source directory. Contains file - attributes and various helper methods. Subclass and add attributes - or methods relevant to a given implementation. + Base object for individual files in the source directory. + + Contains file attributes and various helper methods. """ def __init__(self, src_path, dst_path, logger=None): - """Initialized with the source path and expected destination path.""" + """ + Initialized with the source path and expected destination path. + + self.logger should be a logging object with an add_file method. + Create various properties and determine the file's mimetype. + """ self.src_path = src_path self.dst_path = dst_path self.filename = os.path.basename(self.src_path) @@ -106,6 +111,7 @@ class FileBase(object): @property def size(self): + """Filesize in bytes as an int, 0 if file does not exist.""" try: size = os.path.getsize(self.src_path) except FileNotFoundError: @@ -114,7 +120,7 @@ class FileBase(object): @property def has_mimetype(self): - """Returns True if file has a full mimetype, else False.""" + """True if file has a main and sub mimetype, else False.""" # TODO: broken mimetype checks should be done somewhere else. # Should the check be by default or should we let the API consumer write it? if not self.main_type or not self.sub_type: @@ -124,7 +130,7 @@ class FileBase(object): @property def has_extension(self): - """Returns True if self.extension is set, else False.""" + """True if self.extension is set, else False.""" if self.extension is None: return False else: @@ -132,35 +138,42 @@ class FileBase(object): @property def is_dangerous(self): - """True if file has been marked 'dangerous' else False.""" + """True if file has been marked 'dangerous', else False.""" return self._file_props['safety_category'] is 'dangerous' @property def is_unknown(self): - """True if file has been marked 'unknown' else False.""" + """True if file has been marked 'unknown', else False.""" return self._file_props['safety_category'] is 'unknown' @property def is_binary(self): - """True if file has been marked 'binary' else False.""" + """True if file has been marked 'binary', else False.""" return self._file_props['safety_category'] is 'binary' @property def is_symlink(self): - """Returns True and updates log if file is a symlink.""" + """True if file is a symlink, else False.""" if self._file_props['symlink'] is False: return False else: return True def set_property(self, prop_string, value): - """Takes a property + a value and adds them to self._file_props.""" + """ + Take a property and a value and add them to self._file_props. + + If prop_string is already in _file_props, set prop_string to value. + If prop_string not in _file_props, set prop_string to value in + _file_props['user_defined']. + """ if prop_string in self._file_props.keys(): self._file_props[prop_string] = value else: self._file_props['user_defined'][prop_string] = value def get_property(self, file_prop): + """Get the value for a property in _file_props.""" # TODO: could probably be refactored if file_prop in self._file_props: return self._file_props[file_prop] @@ -170,16 +183,18 @@ class FileBase(object): return None def add_error(self, error, info): + """Add an error: info pair to _file_props['errors'].""" self._file_props['errors'].update({error: info}) def add_file_string(self, file_string): + """Add a file descriptor string to _file_props.""" self._file_props['file_string_set'].add(file_string) def make_dangerous(self, reason_string=None): """ - Marks a file as dangerous. + Mark file as dangerous. - Prepends and appends DANGEROUS to the destination file name + Prepend and append DANGEROUS to the destination file name to help prevent double-click of death. """ if self.is_dangerous: @@ -190,7 +205,7 @@ class FileBase(object): self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename)) def make_unknown(self): - """Marks a file as an unknown type and prepends UNKNOWN to filename.""" + """Mark file as an unknown type and prepend UNKNOWN to filename.""" if self.is_dangerous or self.is_binary: return self.set_property('safety_category', 'unknown') @@ -198,7 +213,7 @@ class FileBase(object): self.dst_path = os.path.join(path, 'UNKNOWN_{}'.format(filename)) def make_binary(self): - """Marks a file as a binary and appends .bin to filename.""" + """Mark file as a binary and append .bin to filename.""" if self.is_dangerous: return self.set_property('safety_category', 'binary') @@ -206,7 +221,7 @@ class FileBase(object): self.dst_path = os.path.join(path, '{}.bin'.format(filename)) def safe_copy(self, src=None, dst=None): - """Copy a file and create directory if needed.""" + """Copy file and create destination directories if needed.""" if src is None: src = self.src_path if dst is None: @@ -220,7 +235,7 @@ class FileBase(object): self.add_error(e, '') def force_ext(self, ext): - """If dst_path does not end in ext, changes it and edits _file_props.""" + """If dst_path does not end in ext, change it and edit _file_props.""" if not self.dst_path.endswith(ext): self.set_property('force_ext', True) self.dst_path += ext @@ -228,7 +243,7 @@ class FileBase(object): self.set_property('extension', ext) def create_metadata_file(self, ext): - """Create a separate file to hold this file's metadata.""" + """Create a separate file to hold metadata from this file.""" try: # make sure we aren't overwriting anything if os.path.exists(self.src_path + ext): @@ -247,7 +262,7 @@ class FileBase(object): return False def write_log(self): - """Print the logs related to the current file being processed.""" + """Write logs from file to self.logger.""" file_log = self.logger.add_file(self) file_log.fields(**self._file_props) @@ -273,7 +288,7 @@ class GroomerLogger(object): self.log_debug_out = os.devnull def tree(self, base_dir, padding=' '): - """Writes a graphical tree to the log for a given directory.""" + """Write a graphical tree to the log for `base_dir`.""" with open(self.log_content, 'ab') as lf: lf.write(bytes('#' * 80 + '\n', 'UTF-8')) lf.write(bytes('{}+- {}/\n'.format(padding, os.path.basename(os.path.abspath(base_dir)).encode()), 'utf8')) @@ -289,7 +304,7 @@ class GroomerLogger(object): lf.write('{}+-- {}\t- {}\n'.format(padding, f, self._computehash(curpath)).encode(errors='ignore')) def _computehash(self, path): - """Returns a sha256 hash of a file at a given path.""" + """Return a sha256 hash of a file at a given path.""" s = hashlib.sha256() with open(path, 'rb') as f: while True: @@ -300,6 +315,7 @@ class GroomerLogger(object): return s.hexdigest() def add_file(self, file): + """Add a file to the log.""" return self.log.name('file.src_path') @@ -340,9 +356,7 @@ class KittenGroomerBase(object): # TODO: feels like this function doesn't need to exist if we move main() def processdir(self, src_dir, dst_dir): - """ - Implement this function in your subclass to define file processing behavior. - """ + """Implement this function to define file processing behavior.""" raise ImplementationRequired('Please implement processdir.') From 1abfb432b1170051c47468bb2c0b3209f7a17ce2 Mon Sep 17 00:00:00 2001 From: Dan Puttick Date: Wed, 15 Mar 2017 22:53:14 -0400 Subject: [PATCH 36/36] Edit README.md * Mention that example files are not up to date with the new API changes * Update example code for API changes --- README.md | 98 +++++++++++++++++++++++-------------------------------- setup.py | 2 +- 2 files changed, 42 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index f0a0cf3..0f368ab 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ PyCIRCLean is the core Python code used by [CIRCLean](https://github.com/CIRCL/Circlean/), an open-source USB key and document sanitizer created by [CIRCL](https://www.circl.lu/). This module has been separated from the device-specific scripts and can be used for dedicated security applications to sanitize documents from hostile environments -to trusted environments. PyCIRCLean is currently Python 3.3+ only. +to trusted environments. PyCIRCLean is currently Python 3.3+ compatible. # Installation @@ -23,10 +23,12 @@ pip install . # How to use PyCIRCLean PyCIRCLean is a simple Python library to handle file checking and sanitization. -PyCIRCLean is designed to be overloaded and extended to cover specific checking +PyCIRCLean is designed to be extended to cover specific checking and sanitization workflows in different organizations such as industrial environments or restricted/classified ICT environments. A series of practical examples utilizing PyCIRCLean can be found -in the [./examples](./examples) directory. +in the [./examples](./examples) directory. Note: for commits beyond version 2.2.0 these +examples are not guaranteed to work with the PyCIRCLean API. Please check [helpers.py](./kittengroomer/helpers.py) or +[filecheck.py](./bin/filecheck.py) to see the new API interface. The following simple example using PyCIRCLean will only copy files with a .conf extension matching the 'text/plain' MIME type. If any other file is found in the source directory, the files won't be copied to the destination directory. @@ -41,87 +43,69 @@ from kittengroomer import FileBase, KittenGroomerBase, main # Extension -configfiles = {'.conf': 'text/plain'} +class Config: + configfiles = {'.conf': 'text/plain'} class FileSpec(FileBase): def __init__(self, src_path, dst_path): - ''' Init file object, set the extension ''' + """Init file object, set the extension.""" super(FileSpec, self).__init__(src_path, dst_path) + self.valid_files = {} a, self.extension = os.path.splitext(self.src_path) self.mimetype = magic.from_file(self.src_path, mime=True).decode("utf-8") + # The initial version will only accept the file extensions/mimetypes listed here. + self.valid_files.update(Config.configfiles) + + def check(self): + valid = True + expected_mime = self.valid_files.get(self.extension) + if expected_mime is None: + # Unexpected extension => disallowed + valid = False + compare_ext = 'Extension: {} - Expected: {}'.format(self.cur_file.extension, ', '.join(self.valid_files.keys())) + elif self.mimetype != expected_mime: + # Unexpected mimetype => disallowed + valid = False + compare_mime = 'Mime: {} - Expected: {}'.format(self.cur_file.mimetype, expected_mime) + self.add_log_details('valid', valid) + if valid: + self.cur_file.log_string = 'Extension: {} - MimeType: {}'.format(self.cur_file.extension, self.cur_file.mimetype) + else: + self.should_copy = False + if compare_ext is not None: + self.add_log_string(compare_ext) + else: + self.add_log_string(compare_mime) + if self.should_copy: + self.safe_copy() + self.write_log() class KittenGroomerSpec(KittenGroomerBase): def __init__(self, root_src=None, root_dst=None): - ''' - Initialize the basics of the copy - ''' + """Initialize the basics of the copy.""" if root_src is None: root_src = os.path.join(os.sep, 'media', 'src') if root_dst is None: root_dst = os.path.join(os.sep, 'media', 'dst') super(KittenGroomerSpec, self).__init__(root_src, root_dst) - self.valid_files = {} - - # The initial version will only accept the file extensions/mimetypes listed here. - self.valid_files.update(configfiles) - - def _print_log(self): - ''' - Print the logs related to the current file being processed - ''' - tmp_log = self.log_name.fields(**self.cur_file.log_details) - if not self.cur_file.log_details.get('valid'): - tmp_log.warning(self.cur_file.log_string) - else: - tmp_log.debug(self.cur_file.log_string) def processdir(self): - ''' - Main function doing the processing - ''' + """Main function doing the processing.""" to_copy = [] error = [] for srcpath in self._list_all_files(self.src_root_dir): - valid = True - self.log_name.info('Processing {}', srcpath.replace(self.src_root_dir + '/', '')) - self.cur_file = FileSpec(srcpath, srcpath.replace(self.src_root_dir, self.dst_root_dir)) - expected_mime = self.valid_files.get(self.cur_file.extension) - if expected_mime is None: - # Unexpected extension => disallowed - valid = False - compare_ext = 'Extension: {} - Expected: {}'.format(self.cur_file.extension, ', '.join(self.valid_files.keys())) - elif self.cur_file.mimetype != expected_mime: - # Unexpected mimetype => disallowed - valid = False - compare_mime = 'Mime: {} - Expected: {}'.format(self.cur_file.mimetype, expected_mime) - self.cur_file.add_log_details('valid', valid) - if valid: - to_copy.append(self.cur_file) - self.cur_file.log_string = 'Extension: {} - MimeType: {}'.format(self.cur_file.extension, self.cur_file.mimetype) - else: - error.append(self.cur_file) - if compare_ext is not None: - self.cur_file.log_string = compare_ext - else: - self.cur_file.log_string = compare_mime - if len(error) > 0: - for f in error + to_copy: - self.cur_file = f - self._print_log() - else: - for f in to_copy: - self.cur_file = f - self._safe_copy() - self._print_log() + dstpath = srcpath.replace(self.src_root_dir, self.dst_root_dir) + cur_file = FileSpec(srcpath, dstpath) + cur_file.check() if __name__ == '__main__': main(KittenGroomerSpec, ' Only copy some files, returns an error is anything else is found') - exit(0) + ~~~ # How to contribute diff --git a/setup.py b/setup.py index c11f64d..4397c5f 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ from setuptools import setup setup( name='kittengroomer', - version='2.1', + version='2.1.0', author='Raphaël Vinot', author_email='raphael.vinot@circl.lu', maintainer='Raphaël Vinot',