2015-05-11 14:32:59 +02:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
2016-12-06 03:02:46 +01:00
"""
Contains the base objects for use when creating a sanitizer using
PyCIRCLean. Subclass FileBase and KittenGroomerBase to implement your
desired behavior.
"""
2015-05-11 14:32:59 +02:00
import os
2015-11-23 19:54:29 +01:00
import hashlib
2015-05-11 14:32:59 +02:00
import shutil
import argparse
2016-12-01 03:04:59 +01:00
import magic
2017-03-18 05:10:17 +01:00
# import twiggy
2016-12-01 03:04:59 +01:00
2015-05-11 14:32:59 +02:00
class KittenGroomerError(Exception):
    """
    Root exception type for KittenGroomer.

    The text passed to the constructor is forwarded to ``Exception`` and is
    also kept on the instance as ``message``.
    """

    def __init__(self, message):
        # Keep the text reachable as an attribute as well as through str(exc).
        self.message = message
        Exception.__init__(self, message)
class ImplementationRequired(KittenGroomerError):
    """Raised when a method that subclasses must provide was not overridden."""
class FileBase(object):
    """
    Base object for individual files in the source directory.

    Holds the source and destination paths of one file, a dictionary of
    properties describing it (``_file_props``) and helper methods to
    classify, rename and copy it.
    """

    def __init__(self, src_path, dst_path):
        """
        Initialized with the source path and expected destination path.

        Builds the property dictionary, then determines the file's
        extension and mimetype and records them as properties.
        """
        self.src_path = src_path
        self.dst_path = dst_path
        self.filename = os.path.basename(self.src_path)
        self._file_props = {
            'filepath': self.src_path,
            'filename': self.filename,
            'file_size': self.size,
            'maintype': None,
            'subtype': None,
            'extension': None,
            'safety_category': None,
            'symlink': False,
            'copied': False,
            'file_string_set': set(),
            'errors': {},
            'user_defined': {},
        }
        self.extension = self._determine_extension()
        self.set_property('extension', self.extension)
        # _determine_mimetype() may call set_property()/add_error(), so it
        # has to run after _file_props exists.
        self.mimetype = self._determine_mimetype()
        self.should_copy = True
        self.main_type = None
        self.sub_type = None
        if self.mimetype:
            self.main_type, self.sub_type = self._split_subtypes(self.mimetype)
            if self.main_type:
                self.set_property('maintype', self.main_type)
            if self.sub_type:
                self.set_property('subtype', self.sub_type)

    def _determine_extension(self):
        """Return the lower-cased extension of src_path, or None if absent."""
        _, ext = os.path.splitext(self.src_path)
        ext = ext.lower()
        if ext == '':
            ext = None
        return ext

    def _determine_mimetype(self):
        """
        Return the mimetype of the source file as a string.

        Symlinks are reported as 'inode/symlink' without being opened and
        their target is stored in the 'symlink' property. Returns None (and
        records the error) if magic raises UnicodeEncodeError.
        """
        if os.path.islink(self.src_path):
            # magic will throw an IOError on a broken symlink
            self.set_property('symlink', os.readlink(self.src_path))
            return 'inode/symlink'
        try:
            mt = magic.from_file(self.src_path, mime=True)
            # Note: magic will always return something, even if it's just 'data'
        except UnicodeEncodeError as e:
            # FIXME: The encoding of the file is broken (possibly UTF-16)
            # Note: one of the Travis files will trigger this exception
            self.add_error(e, '')
            return None
        # Depending on the bindings, magic hands back bytes or str; always
        # return str so that callers can safely look for '/'.
        if isinstance(mt, bytes):
            return mt.decode('utf-8', errors='replace')
        return mt

    def _split_subtypes(self, mimetype):
        """
        Split 'main/sub' into (main, sub); return (None, None) without '/'.

        Only the first '/' separates the two parts, so a value containing
        more than one slash does not raise.
        """
        if '/' in mimetype:
            main_type, sub_type = mimetype.split('/', 1)
        else:
            main_type, sub_type = None, None
        return main_type, sub_type

    @property
    def size(self):
        """Filesize in bytes as an int, 0 if file does not exist."""
        try:
            size = os.path.getsize(self.src_path)
        except FileNotFoundError:
            size = 0
        return size

    @property
    def has_mimetype(self):
        """True if file has a main and sub mimetype, else False."""
        # TODO: broken mimetype checks should be done somewhere else.
        # Should the check be by default or should we let the API consumer write it?
        return bool(self.main_type and self.sub_type)

    @property
    def has_extension(self):
        """True if self.extension is set, else False."""
        return self.extension is not None

    @property
    def is_dangerous(self):
        """True if file has been marked 'dangerous', else False."""
        # Compare by value: identity checks on strings only work by accident
        # of interning.
        return self._file_props['safety_category'] == 'dangerous'

    @property
    def is_unknown(self):
        """True if file has been marked 'unknown', else False."""
        return self._file_props['safety_category'] == 'unknown'

    @property
    def is_binary(self):
        """True if file has been marked 'binary', else False."""
        return self._file_props['safety_category'] == 'binary'

    @property
    def is_symlink(self):
        """True if file is a symlink, else False."""
        # The property holds False or the link target; an empty target string
        # must still count as a symlink, hence the identity check on False.
        return self._file_props['symlink'] is not False

    def set_property(self, prop_string, value):
        """
        Take a property and a value and add them to self._file_props.

        If prop_string is already in _file_props, set prop_string to value.
        If prop_string not in _file_props, set prop_string to value in
        _file_props['user_defined'].
        """
        if prop_string in self._file_props:
            self._file_props[prop_string] = value
        else:
            self._file_props['user_defined'][prop_string] = value

    def get_property(self, file_prop):
        """
        Get the value for a property in _file_props.

        Built-in properties take precedence over user-defined ones; returns
        None if the property is stored in neither place.
        """
        # TODO: could probably be refactored
        if file_prop in self._file_props:
            return self._file_props[file_prop]
        return self._file_props['user_defined'].get(file_prop)

    def get_all_props(self):
        """Return a dict containing all stored properties of this file."""
        return self._file_props

    def add_error(self, error, info):
        """Add an error: info pair to _file_props['errors']."""
        self._file_props['errors'][error] = info

    def add_file_string(self, file_string):
        """Add a file descriptor string to _file_props."""
        self._file_props['file_string_set'].add(file_string)

    def make_dangerous(self, reason_string=None):
        """
        Mark file as dangerous.

        Prepend and append DANGEROUS to the destination file name
        to help prevent double-click of death. Does nothing if the file is
        already marked dangerous. `reason_string` is currently unused.
        """
        if self.is_dangerous:
            return
        self.set_property('safety_category', 'dangerous')
        # LOG: store reason string somewhere and do something with it
        path, filename = os.path.split(self.dst_path)
        self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename))

    def make_unknown(self):
        """
        Mark file as an unknown type and prepend UNKNOWN to filename.

        Does nothing if the file is already marked dangerous or binary.
        """
        if self.is_dangerous or self.is_binary:
            return
        self.set_property('safety_category', 'unknown')
        path, filename = os.path.split(self.dst_path)
        self.dst_path = os.path.join(path, 'UNKNOWN_{}'.format(filename))

    def make_binary(self):
        """
        Mark file as a binary and append .bin to filename.

        Does nothing if the file is already marked dangerous.
        """
        if self.is_dangerous:
            return
        self.set_property('safety_category', 'binary')
        path, filename = os.path.split(self.dst_path)
        self.dst_path = os.path.join(path, '{}.bin'.format(filename))

    def safe_copy(self, src=None, dst=None):
        """
        Copy file and create destination directories if needed.

        `src` and `dst` default to self.src_path and self.dst_path. Any
        exception raised while copying is recorded with add_error() instead
        of being propagated.
        """
        if src is None:
            src = self.src_path
        if dst is None:
            dst = self.dst_path
        try:
            dst_dir = os.path.dirname(dst)
            # A bare filename has no directory part; os.makedirs('') would
            # raise and the copy would never happen.
            if dst_dir:
                os.makedirs(dst_dir, exist_ok=True)
            shutil.copy(src, dst)
        except Exception as e:
            self.add_error(e, '')

    def force_ext(self, ext):
        """If dst_path does not end in ext, change it and edit _file_props."""
        if not self.dst_path.endswith(ext):
            self.set_property('force_ext', True)
            self.dst_path += ext
        if self._file_props['extension'] != ext:
            self.set_property('extension', ext)

    def create_metadata_file(self, ext):
        """
        Create a separate file to hold metadata from this file.

        Ensures the destination directory exists, stores dst_path + ext in
        self.metadata_file_path and returns it. If src_path + ext already
        exists, a KittenGroomerError is recorded with add_error() and False
        is returned.
        """
        try:
            # make sure we aren't overwriting anything
            # NOTE(review): the check is against src_path while the message
            # and the result use dst_path; presumably this guards against a
            # source file that would later be copied to the same name.
            if os.path.exists(self.src_path + ext):
                raise KittenGroomerError("Cannot create split metadata file for \"" +
                                         self.dst_path + "\", type '" +
                                         ext + "': File exists.")
            dst_dir_path = os.path.dirname(self.dst_path)
            if dst_dir_path:
                os.makedirs(dst_dir_path, exist_ok=True)
            # TODO: Check extension for leading "."
            self.metadata_file_path = self.dst_path + ext
            return self.metadata_file_path
        except KittenGroomerError as e:
            self.add_error(e, '')
            return False
2015-05-11 14:32:59 +02:00
2017-02-27 17:58:10 +01:00
class GroomerLogger(object):
    """Groomer logging interface."""

    def __init__(self, root_dir_path, debug=False):
        """
        Create a fresh 'logs' directory under root_dir_path.

        Sets self.log_path to 'log.txt' inside that directory. When `debug`
        is true, the debug stdout/stderr paths point to files in the log
        directory; otherwise they point to os.devnull.
        """
        self.root_dir = root_dir_path
        self.log_dir_path = self._make_log_dir(root_dir_path)
        self.log_path = os.path.join(self.log_dir_path, 'log.txt')
        # twiggy.quick_setup(file=self.log_processing)
        # self.log = twiggy.log.name('files')
        if debug:
            self.log_debug_err = os.path.join(self.log_dir_path, 'debug_stderr.log')
            self.log_debug_out = os.path.join(self.log_dir_path, 'debug_stdout.log')
        else:
            self.log_debug_err = os.devnull
            self.log_debug_out = os.devnull

    def _make_log_dir(self, root_dir_path):
        """Create an empty 'logs' directory in root_dir_path; return its path."""
        log_dir_path = os.path.join(root_dir_path, 'logs')
        # Any previous log directory is discarded so each run starts clean.
        if os.path.exists(log_dir_path):
            shutil.rmtree(log_dir_path)
        os.makedirs(log_dir_path)
        return log_dir_path

    def tree(self, base_dir, padding='   '):
        """
        Write a graphical tree to the log for `base_dir`.

        The log file is opened once, in append mode, and the whole tree is
        written through that single handle so that entries are stored in
        traversal order (parent directory before its contents).
        """
        with open(self.log_path, 'ab') as lf:
            self._write_tree(lf, base_dir, padding)

    def _write_tree(self, lf, base_dir, padding):
        """Write the listing of `base_dir` to the open binary file `lf`."""
        lf.write(bytes('#' * 80 + '\n', 'UTF-8'))
        dirname = os.path.basename(os.path.abspath(base_dir))
        # Format the str name and encode the finished line; formatting an
        # already-encoded name would log its b'...' repr.
        lf.write('{}+- {}/\n'.format(padding, dirname).encode(errors='ignore'))
        padding += '|  '
        for f in sorted(os.listdir(base_dir)):
            curpath = os.path.join(base_dir, f)
            # Symlinks are checked first so that links to directories are
            # reported rather than followed.
            if os.path.islink(curpath):
                line = '{}   +-- {}\t- Symbolic link to {}\n'.format(
                    padding, f, os.readlink(curpath))
                lf.write(line.encode(errors='ignore'))
            elif os.path.isdir(curpath):
                self._write_tree(lf, curpath, padding)
            elif os.path.isfile(curpath):
                line = '{}   +-- {}\t- {}\n'.format(
                    padding, f, self._computehash(curpath))
                lf.write(line.encode(errors='ignore'))

    def _computehash(self, path):
        """Return a sha256 hash of a file at a given path."""
        s = hashlib.sha256()
        with open(path, 'rb') as f:
            # Read in 1 MiB chunks so large files are not loaded at once.
            for buf in iter(lambda: f.read(0x100000), b''):
                s.update(buf)
        return s.hexdigest()

    def add_file(self, file_props):
        """Add a file to the log. Takes a dict of file properties."""
        # Not implemented yet: currently a no-op.
        pass
2017-02-21 04:38:41 +01:00
2017-02-21 01:03:11 +01:00
class KittenGroomerBase(object):
    """Base object responsible for copy/sanitization process."""

    def __init__(self, src_root_dir, dst_root_dir):
        """Initialized with path to source and dest directories."""
        self.src_root_dir = src_root_dir
        self.dst_root_dir = dst_root_dir

    def safe_rmtree(self, directory):
        """Remove a directory tree if it exists."""
        if os.path.exists(directory):
            shutil.rmtree(directory)

    def safe_remove(self, filepath):
        """Remove a file if it exists."""
        if os.path.exists(filepath):
            os.remove(filepath)

    def safe_mkdir(self, directory):
        """Make a directory if it does not exist."""
        if not os.path.exists(directory):
            os.makedirs(directory)

    def list_all_files(self, directory):
        """Generator yielding path to all of the files in a directory tree."""
        for root, _dirs, files in os.walk(directory):
            for filename in files:
                yield os.path.join(root, filename)

    #######################

    # TODO: feels like this function doesn't need to exist if we move main()
    def processdir(self, src_dir=None, dst_dir=None):
        """
        Implement this function to define file processing behavior.

        Both arguments are optional so that the method can be invoked
        without arguments, as main() does; the base implementation ignores
        them and always raises ImplementationRequired.
        """
        raise ImplementationRequired('Please implement processdir.')
2015-05-11 14:32:59 +02:00
2017-02-21 04:38:41 +01:00
# TODO: Maybe this shouldn't exist? It should probably get moved to filecheck since this isn't really API code
2016-12-06 18:43:28 +01:00
def main(kg_implementation, description='Call a KittenGroomer implementation to process files present in the source directory and copy them to the destination directory.'):
    """
    Command-line entry point for a KittenGroomer implementation.

    Parses -s/--source and -d/--destination from the command line,
    instantiates `kg_implementation` with those two paths and calls its
    processdir() method. Both options are required, so a missing directory
    is reported by argparse instead of being passed on as None.
    """
    parser = argparse.ArgumentParser(prog='KittenGroomer', description=description)
    parser.add_argument('-s', '--source', type=str, required=True,
                        help='Source directory')
    parser.add_argument('-d', '--destination', type=str, required=True,
                        help='Destination directory')
    args = parser.parse_args()
    kg = kg_implementation(args.source, args.destination)
    kg.processdir()