mirror of https://github.com/CIRCL/PyCIRCLean
310 lines
11 KiB
Python
310 lines
11 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
Contains the base objects for use when creating a sanitizer using
|
|
PyCIRCLean. Subclass FileBase and KittenGroomerBase to implement your
|
|
desired behavior.
|
|
"""
|
|
|
|
|
|
import os
|
|
import sys
|
|
import hashlib
|
|
import shutil
|
|
import argparse
|
|
|
|
import magic
|
|
from twiggy import quick_setup, log
|
|
|
|
|
|
class KittenGroomerError(Exception):
    """Root of the KittenGroomer exception hierarchy."""

    def __init__(self, message):
        """
        Build the exception from a human-readable ``message``.

        The message is handed to ``Exception`` (so it ends up in ``args``
        and ``str(exc)``) and is additionally kept on ``self.message``.
        """
        super(KittenGroomerError, self).__init__(message)
        self.message = message
|
|
|
|
|
|
class ImplementationRequired(KittenGroomerError):
    """Signals that a method meant to be overridden was called as-is."""
|
|
|
|
|
|
class FileBase(object):
    """
    Base object for individual files in the source directory.

    Contains file attributes and various helper methods. Subclass and add
    attributes or methods relevant to a given implementation.

    Attributes set by ``__init__``:
        src_path / dst_path: source path and (mutable) destination path.
        log_details: dict of facts collected about the file; always
            contains ``'filepath'``.
        log_string: free-form string, initialised to ``''``.
        extension: result of ``os.path.splitext`` on ``src_path``
            (includes the leading dot, ``''`` when there is none).
        mimetype / main_type / sub_type: see ``_determine_mimetype``.
    """

    def __init__(self, src_path, dst_path):
        """Initialized with the source path and expected destination path."""
        self.src_path = src_path
        self.dst_path = dst_path
        self.log_details = {'filepath': self.src_path}
        self.log_string = ''
        _, self.extension = os.path.splitext(self.src_path)
        self._determine_mimetype()

    def _determine_mimetype(self):
        """
        Set ``mimetype``, ``main_type`` and ``sub_type`` for ``src_path``.

        Symlinks are never handed to ``magic``; they get the fixed type
        ``'inode/symlink'``. For everything else the type comes from
        ``magic.from_file(..., mime=True)``. When no ``type/subtype`` pair
        can be derived, ``main_type`` and ``sub_type`` are both ``''``.
        """
        if os.path.islink(self.src_path):
            # magic will throw an IOError on a broken symlink
            self.mimetype = 'inode/symlink'
        else:
            try:
                mt = magic.from_file(self.src_path, mime=True)
                # magic will always return something, even if it's just 'data'
            except UnicodeEncodeError as e:
                # FIXME: The encoding of the file is broken (possibly UTF-16)
                mt = ''
                self.log_details.update({'UnicodeError': e})
            try:
                self.mimetype = mt.decode("utf-8")
            except (AttributeError, UnicodeError):
                # AttributeError: the value is already text and has no
                # .decode (Python 3 str). UnicodeError: it could not be
                # converted. Either way keep the value as returned, without
                # also swallowing KeyboardInterrupt/SystemExit like a bare
                # ``except:`` would.
                self.mimetype = mt
        if self.mimetype and '/' in self.mimetype:
            # maxsplit=1 so an unexpected extra '/' cannot make the
            # two-name unpacking raise ValueError out of __init__.
            self.main_type, self.sub_type = self.mimetype.split('/', 1)
        else:
            self.main_type = ''
            self.sub_type = ''

    def has_mimetype(self):
        """
        Returns True if file has a full mimetype, else False.

        Returns False + updates log if self.main_type or self.sub_type
        are not set.
        """
        if not self.main_type or not self.sub_type:
            self.log_details.update({'broken_mime': True})
            return False
        return True

    def has_extension(self):
        """
        Returns True if self.extension is set, else False.

        Returns False + updates self.log_details if self.extension is not set.
        """
        if not self.extension:
            self.log_details.update({'no_extension': True})
            return False
        return True

    def is_dangerous(self):
        """Returns True if self.log_details has a truthy 'dangerous' entry."""
        return bool(self.log_details.get('dangerous'))

    def is_symlink(self):
        """
        Returns True and updates log if file is a symlink.

        On a symlink the link target (``os.readlink``) is stored under the
        ``'symlink'`` key of ``self.log_details``.
        """
        if self.has_mimetype() and self.main_type == 'inode' and self.sub_type == 'symlink':
            self.log_details.update({'symlink': os.readlink(self.src_path)})
            return True
        return False

    def add_log_details(self, key, value):
        """Takes a key + a value and adds them to self.log_details."""
        self.log_details[key] = value

    def make_dangerous(self):
        """
        Marks a file as dangerous.

        Prepends and appends DANGEROUS to the destination file name
        to avoid double-click of death.
        """
        if self.is_dangerous():
            # Already marked as dangerous, do nothing
            return
        self.log_details['dangerous'] = True
        path, filename = os.path.split(self.dst_path)
        self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename))

    def make_unknown(self):
        """Marks a file as an unknown type and prepends UNKNOWN to filename."""
        if self.is_dangerous() or self.log_details.get('binary'):
            # Already marked as dangerous or binary, do nothing
            return
        self.log_details['unknown'] = True
        path, filename = os.path.split(self.dst_path)
        self.dst_path = os.path.join(path, 'UNKNOWN_{}'.format(filename))

    def make_binary(self):
        """Marks a file as a binary and appends .bin to filename."""
        if self.is_dangerous():
            # Already marked as dangerous, do nothing
            return
        self.log_details['binary'] = True
        path, filename = os.path.split(self.dst_path)
        self.dst_path = os.path.join(path, '{}.bin'.format(filename))

    def force_ext(self, ext):
        """If dst_path does not end in ext, appends the ext and updates log."""
        if not self.dst_path.endswith(ext):
            self.log_details['force_ext'] = True
            self.dst_path += ext
|
|
|
|
|
|
class KittenGroomerBase(object):
    """Base object responsible for copy/sanitization process."""

    def __init__(self, root_src, root_dst, debug=False):
        """
        Initialized with path to source and dest directories.

        Side effects, in order: any existing ``<root_dst>/logs`` directory
        is deleted and recreated, a tree listing of ``root_src`` is appended
        to ``logs/content.log``, twiggy is set up to write to
        ``logs/processing.log``, and the ``data`` directory next to this
        module is appended to the ``PATH`` environment variable.

        With ``debug`` true, ``log_debug_err`` / ``log_debug_out`` point at
        files inside the log directory; otherwise both are ``os.devnull``.
        """
        self.src_root_dir = root_src
        self.dst_root_dir = root_dst
        self.log_root_dir = os.path.join(self.dst_root_dir, 'logs')
        self._safe_rmtree(self.log_root_dir)
        self._safe_mkdir(self.log_root_dir)
        self.log_processing = os.path.join(self.log_root_dir, 'processing.log')
        self.log_content = os.path.join(self.log_root_dir, 'content.log')
        self.tree(self.src_root_dir)

        quick_setup(file=self.log_processing)
        self.log_name = log.name('files')
        self.resources_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data')
        os.environ["PATH"] += os.pathsep + self.resources_path

        self.cur_file = None

        self.debug = debug
        if self.debug:
            self.log_debug_err = os.path.join(self.log_root_dir, 'debug_stderr.log')
            self.log_debug_out = os.path.join(self.log_root_dir, 'debug_stdout.log')
        else:
            self.log_debug_err = os.devnull
            self.log_debug_out = os.devnull

    def _computehash(self, path):
        """Returns a sha1 hash (hex string) of a file at a given path."""
        s = hashlib.sha1()
        with open(path, 'rb') as f:
            # Read in 1 MiB chunks so large files are never fully in memory.
            for buf in iter(lambda: f.read(0x100000), b''):
                s.update(buf)
        return s.hexdigest()

    def tree(self, base_dir, padding=' '):
        """
        Writes a graphical tree to the log for a given directory.

        The listing is appended to ``self.log_content``. Each directory
        (including every subdirectory reached by recursion) starts with a
        line of 80 ``#`` characters followed by its own name; symlinks are
        listed with their target and regular files with their sha1.
        """
        if sys.version_info.major == 2:
            self.__tree_py2(base_dir, padding)
        else:
            self.__tree_py3(base_dir, padding)

    def __tree_py2(self, base_dir, padding=' '):
        """Python 2 flavour of ``tree``: writes native ``str`` lines."""
        with open(self.log_content, 'ab') as lf:
            lf.write('#' * 80 + '\n')
            lf.write('{}+- {}/\n'.format(padding, os.path.basename(os.path.abspath(base_dir))))
            padding += '| '
            files = sorted(os.listdir(base_dir))
            for f in files:
                curpath = os.path.join(base_dir, f)
                if os.path.islink(curpath):
                    lf.write('{}+-- {}\t- Symbolic link to {}\n'.format(padding, f, os.readlink(curpath)))
                elif os.path.isdir(curpath):
                    # The recursive call appends through its own handle;
                    # flush ours first so the parent's lines stay ahead of
                    # the child's in the file.
                    lf.flush()
                    self.tree(curpath, padding)
                elif os.path.isfile(curpath):
                    lf.write('{}+-- {}\t- {}\n'.format(padding, f, self._computehash(curpath)))

    def __tree_py3(self, base_dir, padding=' '):
        """Python 3 flavour of ``tree``: formats text, then writes bytes."""
        with open(self.log_content, 'ab') as lf:
            lf.write(bytes('#' * 80 + '\n', 'UTF-8'))
            # Format the directory name as text and encode the finished
            # line; formatting an already-encoded name would put the
            # literal "b'...'" repr into the log.
            dirname = os.path.basename(os.path.abspath(base_dir))
            lf.write('{}+- {}/\n'.format(padding, dirname).encode(errors='ignore'))
            padding += '| '
            files = sorted(os.listdir(base_dir))
            for f in files:
                curpath = os.path.join(base_dir, f)
                if os.path.islink(curpath):
                    lf.write('{}+-- {}\t- Symbolic link to {}\n'.format(padding, f, os.readlink(curpath)).encode(errors='ignore'))
                elif os.path.isdir(curpath):
                    # The recursive call appends through its own handle;
                    # flush ours first so the parent's lines stay ahead of
                    # the child's in the file.
                    lf.flush()
                    self.tree(curpath, padding)
                elif os.path.isfile(curpath):
                    lf.write('{}+-- {}\t- {}\n'.format(padding, f, self._computehash(curpath)).encode(errors='ignore'))

    # ##### Helpers #####
    def _safe_rmtree(self, directory):
        """Remove a directory tree if it exists."""
        if os.path.exists(directory):
            shutil.rmtree(directory)

    def _safe_remove(self, filepath):
        """Remove a file if it exists."""
        if os.path.exists(filepath):
            os.remove(filepath)

    def _safe_mkdir(self, directory):
        """
        Make a directory (and missing parents) if it does not exist.

        An empty ``directory`` means "the current directory" (it is what
        ``os.path.split`` yields for a bare filename) and is a no-op;
        ``os.makedirs('')`` would raise.
        """
        if directory and not os.path.exists(directory):
            os.makedirs(directory)

    def _safe_copy(self, src=None, dst=None):
        """
        Copy a file and create directory if needed.

        ``src`` / ``dst`` default to the paths of ``self.cur_file``.
        Returns True on success. Any exception is printed and reported as
        a False return value instead of being propagated.
        """
        if src is None:
            src = self.cur_file.src_path
        if dst is None:
            dst = self.cur_file.dst_path
        try:
            dst_path, filename = os.path.split(dst)
            self._safe_mkdir(dst_path)
            shutil.copy(src, dst)
            return True
        except Exception as e:
            # TODO: Logfile
            print(e)
            return False

    def _safe_metadata_split(self, ext):
        """
        Create a separate file to hold this file's metadata.

        Returns a file object opened in ``'w+'`` mode on
        ``self.cur_file.dst_path + ext`` (the caller is responsible for
        closing it). Returns False, after printing the error, if
        ``self.cur_file.src_path + ext`` already exists or anything else
        goes wrong.
        """
        dst = self.cur_file.dst_path
        try:
            if os.path.exists(self.cur_file.src_path + ext):  # should we check dst_path as well?
                raise KittenGroomerError("Cannot create split metadata file for \"" +
                                         self.cur_file.dst_path + "\", type '" +
                                         ext + "': File exists.")
            dst_path, filename = os.path.split(dst)
            self._safe_mkdir(dst_path)
            return open(dst + ext, 'w+')
        except Exception as e:
            # TODO: Logfile
            print(e)
            return False

    def _list_all_files(self, directory):
        """Generate an iterator over all the files in a directory tree."""
        for root, dirs, files in os.walk(directory):
            for filename in files:
                filepath = os.path.join(root, filename)
                yield filepath

    def _print_log(self):
        """
        Print log, should be called after each file.

        You probably want to reimplement it in the subclass.
        """
        tmp_log = self.log_name.fields(**self.cur_file.log_details)
        tmp_log.info('It did a thing.')

    #######################

    def processdir(self, src_dir=None, dst_dir=None):
        """
        Implement this function in your subclass to define file processing behavior.

        The base implementation always raises ImplementationRequired.
        """
        raise ImplementationRequired('Please implement processdir.')
|
|
|
|
|
|
def main(kg_implementation, description='Call a KittenGroomer implementation to process files present in the source directory and copy them to the destination directory.'):
    """
    Command-line entry point shared by KittenGroomer implementations.

    Parses ``-s/--source`` and ``-d/--destination`` from ``sys.argv``,
    instantiates ``kg_implementation(source, destination)`` and calls its
    ``processdir()``.

    Both options are mandatory: leaving one out makes argparse print a
    usage message and exit, rather than passing ``None`` through as a
    directory path.
    """
    parser = argparse.ArgumentParser(prog='KittenGroomer', description=description)
    parser.add_argument('-s', '--source', type=str, required=True, help='Source directory')
    parser.add_argument('-d', '--destination', type=str, required=True, help='Destination directory')
    args = parser.parse_args()
    kg = kg_implementation(args.source, args.destination)
    kg.processdir()
|