2015-05-11 14:32:59 +02:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# -*- coding: utf-8 -*-
|
2016-12-06 03:02:46 +01:00
|
|
|
|
|
|
|
"""
|
|
|
|
Contains the base objects for use when creating a sanitizer using
|
2017-04-10 13:18:27 +02:00
|
|
|
PyCIRCLean. Subclass or import from FileBase/KittenGroomerBase and implement
|
|
|
|
your desired behavior.
|
2016-12-06 03:02:46 +01:00
|
|
|
"""
|
|
|
|
|
|
|
|
|
2015-05-11 14:32:59 +02:00
|
|
|
import os
|
2015-11-23 19:54:29 +01:00
|
|
|
import hashlib
|
2015-05-11 14:32:59 +02:00
|
|
|
import shutil
|
|
|
|
import argparse
|
2017-08-05 00:02:31 +02:00
|
|
|
import stat
|
2015-05-11 14:32:59 +02:00
|
|
|
|
2016-12-01 03:04:59 +01:00
|
|
|
import magic
|
|
|
|
|
2015-05-11 14:32:59 +02:00
|
|
|
|
|
|
|
class FileBase(object):
    """
    Base object for individual files in the source directory.

    Holds the attributes of a single source file (paths, size, mimetype,
    danger flag, descriptions, errors, user-defined properties) together
    with helper methods for copying and renaming it.
    """

    def __init__(self, src_path, dst_path):
        """
        Initialize with the source path and the expected destination path.

        Only the directory part of `dst_path` is stored; the destination
        file name is derived from the source file name so that later renames
        (`make_dangerous`, `force_ext`) are reflected in `dst_path`.
        """
        self.src_path = src_path
        self.dst_dir = os.path.dirname(dst_path)
        self.filename = os.path.basename(src_path)
        self.size = self._get_size(src_path)
        self.is_dangerous = False
        self.copied = False
        self.symlink_path = None
        self._description_string = []  # array of descriptions to be joined
        self._errors = {}
        self._user_defined = {}
        self.should_copy = True
        # Must run last: it may call set_property/add_error, which rely on
        # symlink_path, _errors and _user_defined already being present.
        self.mimetype = self._determine_mimetype(src_path)

    @property
    def dst_path(self):
        """Destination path: `dst_dir` joined with the current `filename`."""
        return os.path.join(self.dst_dir, self.filename)

    @property
    def extension(self):
        """Lower-cased extension of `filename` (with dot), or None if absent."""
        _, ext = os.path.splitext(self.filename)
        return ext.lower() if ext else None

    @property
    def maintype(self):
        """Part of the mimetype before the slash, or None."""
        main, _ = self._split_mimetype(self.mimetype)
        return main

    @property
    def subtype(self):
        """Part of the mimetype after the slash, or None."""
        _, sub = self._split_mimetype(self.mimetype)
        return sub

    @property
    def has_mimetype(self):
        """True if file has a main and sub mimetype, else False."""
        return bool(self.maintype and self.subtype)

    @property
    def has_extension(self):
        """True if self.extension is set, else False."""
        return self.extension is not None

    @property
    def is_symlink(self):
        """True if file is a symlink, else False."""
        return self.symlink_path is not None

    @property
    def description_string(self):
        """All stored descriptions joined by ', ', or 'No description'."""
        if len(self._description_string) == 0:
            return 'No description'
        if len(self._description_string) == 1:
            return self._description_string[0]
        joined = ', '.join(self._description_string)
        return joined.strip(', ')

    @description_string.setter
    def description_string(self, value):
        """
        Append `value` to the stored descriptions, skipping duplicates.

        Raises TypeError if `value` is not a str. If the backing list does
        not exist yet (the getter fails), `value` is stored as-is.
        """
        if hasattr(self, 'description_string'):
            if isinstance(value, str):
                if value not in self._description_string:
                    self._description_string.append(value)
            else:
                raise TypeError("Description_string can only include strings")
        else:
            self._description_string = value

    def set_property(self, prop_string, value):
        """
        Store `value` under the name `prop_string`.

        If the object already has an attribute called `prop_string`, that
        attribute is assigned via `setattr` (so read-only properties such as
        `extension` raise AttributeError). Otherwise the pair is stored in
        the user-defined properties dict.
        """
        if hasattr(self, prop_string):
            setattr(self, prop_string, value)
        else:
            self._user_defined[prop_string] = value

    def get_property(self, prop_string):
        """
        Get the value for a property stored on the file.

        Looks for an attribute first, then in the user-defined properties.
        Returns `None` if `prop_string` cannot be found on the file.
        """
        try:
            return getattr(self, prop_string)
        except AttributeError:
            return self._user_defined.get(prop_string, None)

    def get_all_props(self):
        """Return a dict containing all stored properties of this file."""
        # Maybe move this onto the logger? I think that makes more sense
        return {
            'filepath': self.src_path,
            'filename': self.filename,
            'file_size': self.size,
            'mimetype': self.mimetype,
            'maintype': self.maintype,
            'subtype': self.subtype,
            'extension': self.extension,
            'is_dangerous': self.is_dangerous,
            'is_symlink': self.is_symlink,
            'symlink_path': self.symlink_path,
            'copied': self.copied,
            'description_string': self.description_string,
            'errors': self._errors,
            'user_defined': self._user_defined,
        }

    def add_error(self, error, info_string):
        """Add an `error`: `info_string` pair to the file."""
        self._errors[error] = info_string

    def add_description(self, description_string):
        """
        Add a description string to the file.

        If `description_string` is already present, will prevent duplicates.
        """
        self.set_property('description_string', description_string)

    def make_dangerous(self, reason_string=None):
        """
        Mark file as dangerous.

        Prepend and append DANGEROUS to the destination file name
        to help prevent double-click of death. The rename happens only the
        first time; `reason_string`, when given, is always recorded as a
        description.
        """
        if not self.is_dangerous:
            self.set_property('is_dangerous', True)
            self.filename = 'DANGEROUS_{}_DANGEROUS'.format(self.filename)
        if reason_string:
            self.add_description(reason_string)

    def safe_copy(self, src=None, dst=None):
        """
        Copy file and create destination directories if needed.

        `src` defaults to `self.src_path` and `dst` to `self.dst_path`.
        The parent directory of the actual destination is created, and all
        exec bits are cleared on the copy. An IOError/OSError during any of
        these steps is recorded with `add_error` instead of being raised.
        """
        if src is None:
            src = self.src_path
        if dst is None:
            dst = self.dst_path
        try:
            # Create the directory the copy will really land in, which may
            # differ from self.dst_dir when an explicit dst is given.
            parent_dir = os.path.dirname(dst)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            shutil.copy(src, dst)
            self._remove_exec_bit(dst)
        except IOError as e:
            # Probably means we can't write in the dest dir
            self.add_error(e, '')

    def force_ext(self, extension):
        """
        If the file name does not end in `extension`, append it.

        A leading dot is added to `extension` when missing. The `extension`
        property is derived from `filename`, so it needs no separate update.
        """
        new_ext = self._check_leading_dot(extension)
        if not self.filename.endswith(new_ext):
            # TODO: log that the extension was changed
            self.filename += new_ext

    def create_metadata_file(self, extension):
        # TODO: this method name is confusing
        """
        Reserve a path for a separate file holding extracted metadata.

        The string `extension` will be used as the extension for the file.
        Returns the metadata path (also stored as `self.metadata_file_path`),
        or False when a source file already occupies `src_path + extension`;
        in that case a KittenGroomerError is recorded with `add_error`.
        """
        ext = self._check_leading_dot(extension)
        # Prevent using the same path as another file from src_path
        if os.path.exists(self.src_path + ext):
            error = KittenGroomerError(
                "Could not create metadata file for \"" +
                self.filename +
                "\": a file with that path exists.")
            self.add_error(error, '')
            return False
        os.makedirs(self.dst_dir, exist_ok=True)
        # TODO: shouldn't mutate state and also return something
        self.metadata_file_path = self.dst_path + ext
        return self.metadata_file_path

    def _check_leading_dot(self, ext):
        """Return `ext` with a leading dot added when it is non-empty and lacks one."""
        # TODO: this method name is confusing
        if len(ext) > 0 and not ext.startswith('.'):
            return '.' + ext
        return ext

    def _determine_mimetype(self, file_path):
        """
        Return the mimetype of `file_path` as reported by libmagic.

        Symlinks are reported as 'inode/symlink' and their target is stored
        in `symlink_path`. Returns None when libmagic fails with a
        UnicodeEncodeError (the error is recorded on the file).
        """
        if os.path.islink(file_path):
            # libmagic will throw an IOError on a broken symlink
            self.set_property('symlink_path', os.readlink(file_path))
            return 'inode/symlink'
        try:
            # libmagic always returns something, even if it's just 'data'
            mt = magic.from_file(file_path, mime=True)
        except UnicodeEncodeError as e:
            self.add_error(e, '')
            return None
        # NOTE(review): the result appears to be bytes or str depending on
        # the python-magic version - confirm. Bytes are decoded leniently so
        # that the mimetype is always text (or None) for _split_mimetype.
        if isinstance(mt, bytes):
            return mt.decode('utf-8', errors='replace')
        return mt

    def _split_mimetype(self, mimetype):
        """Split 'main/sub' into (main, sub); (None, None) if not splittable."""
        if mimetype and '/' in mimetype:
            # Split on the first slash only so unusual values with extra
            # slashes cannot break the two-way unpacking.
            main_type, sub_type = mimetype.split('/', 1)
            return main_type, sub_type
        return None, None

    def _get_size(self, file_path):
        """Filesize in bytes as an int, 0 if file does not exist."""
        try:
            return os.path.getsize(file_path)
        except FileNotFoundError:
            return 0

    def _remove_exec_bit(self, file_path):
        """Clear the owner, group and other exec bits on `file_path`."""
        all_exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
        current_perms = self._get_file_permissions(file_path)
        os.chmod(file_path, current_perms & ~all_exec_bits)

    def _get_file_permissions(self, file_path):
        """Return the permission bits of `file_path` without following symlinks."""
        full_mode = os.stat(file_path, follow_symlinks=False).st_mode
        return stat.S_IMODE(full_mode)
|
|
|
|
|
2015-05-11 14:32:59 +02:00
|
|
|
|
2017-04-10 13:07:21 +02:00
|
|
|
class Logging(object):
    """Namespace for logging-related helper functions."""

    @staticmethod
    def computehash(path):
        """Return the hex sha256 digest of the file located at `path`."""
        digest = hashlib.sha256()
        with open(path, 'rb') as stream:
            # Read in 1 MiB chunks until read() returns an empty bytes object.
            for chunk in iter(lambda: stream.read(0x100000), b''):
                digest.update(chunk)
        return digest.hexdigest()
|
|
|
|
|
|
|
|
|
|
|
|
class KittenGroomerBase(object):
    """Base object responsible for copy/sanitization process."""

    def __init__(self, src_root_path, dst_root_path):
        """Initialized with path to source and dest directories."""
        self.src_root_path = os.path.abspath(src_root_path)
        self.dst_root_path = os.path.abspath(dst_root_path)

    def safe_rmtree(self, directory_path):
        """Remove a directory tree if it exists."""
        if os.path.exists(directory_path):
            shutil.rmtree(directory_path)

    def safe_remove(self, file_path):
        """Remove file at file_path if it exists."""
        if os.path.exists(file_path):
            os.remove(file_path)

    def safe_mkdir(self, directory_path):
        """Make a directory (and any missing parents) if it does not exist."""
        # Attempt the creation and tolerate an existing path rather than
        # checking first, so a path that appears in between cannot make
        # this fail.
        try:
            os.makedirs(directory_path)
        except FileExistsError:
            pass

    def list_all_files(self, directory_path):
        """Generator yielding path to all of the files in a directory tree."""
        for root, _dirs, files in os.walk(directory_path):
            for filename in files:
                yield os.path.join(root, filename)

    #######################

    def processdir(self, src_dir=None, dst_dir=None):
        """
        Implement this function to define file processing behavior.

        Both parameters are optional so that the argument-less call made by
        `main()` reaches this method; the base implementation always raises
        ImplementationRequired.
        """
        raise ImplementationRequired('Please implement processdir.')
|
2015-05-11 14:32:59 +02:00
|
|
|
|
|
|
|
|
2017-07-11 20:45:12 +02:00
|
|
|
class KittenGroomerError(Exception):
    """Root exception type for errors raised by KittenGroomer code."""

    def __init__(self, message):
        # Keep the text on `.message` as well as in the standard `args`.
        self.message = message
        super().__init__(message)
|
|
|
|
|
|
|
|
|
|
|
|
class ImplementationRequired(KittenGroomerError):
    """Raised when a method that subclasses must provide was not implemented."""
|
|
|
|
|
|
|
|
|
2017-08-07 18:09:22 +02:00
|
|
|
def main(
        kg_implementation,
        description=("Call a KittenGroomer implementation to process files "
                     "present in the source directory and copy them to the "
                     "destination directory.")):
    """
    Command-line entry point for a KittenGroomer implementation.

    Prints `description`, parses the `-s/--source` and `-d/--destination`
    options from the command line, instantiates `kg_implementation` with
    those two values and calls its `processdir()` method without arguments.

    Both options are required: without them the implementation would be
    handed `None` paths, so argparse now reports the missing option and
    exits instead.
    """
    print(description)
    parser = argparse.ArgumentParser(prog='KittenGroomer', description=description)
    parser.add_argument('-s', '--source', type=str, required=True,
                        help='Source directory')
    parser.add_argument('-d', '--destination', type=str, required=True,
                        help='Destination directory')
    args = parser.parse_args()
    kg = kg_implementation(args.source, args.destination)
    kg.processdir()
|