2015-05-11 14:32:59 +02:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
2016-12-06 03:02:46 +01:00
"""
Contains the base objects for use when creating a sanitizer using
PyCIRCLean. Subclass or import from FileBase/KittenGroomerBase and implement
your desired behavior.
2016-12-06 03:02:46 +01:00
"""
2015-05-11 14:32:59 +02:00
import os
2015-11-23 19:54:29 +01:00
import hashlib
2015-05-11 14:32:59 +02:00
import shutil
import argparse
2016-12-01 03:04:59 +01:00
import magic
2015-05-11 14:32:59 +02:00
class FileBase(object):
    """
    Base object for individual files in the source directory.

    Contains file attributes and various helper methods.
    """

    def __init__(self, src_path, dst_path):
        """
        Initialized with the source path and expected destination path.

        Create various properties and determine the file's mimetype.
        """
        self.src_path = src_path
        self.dst_dir = os.path.dirname(dst_path)
        self.filename = os.path.basename(src_path)
        self.size = self._get_size(src_path)
        self.is_dangerous = False
        self.copied = False
        self.symlink_path = None
        # First assignment initializes the backing list (see the setter).
        self.description_string = []
        self._errors = {}
        self._user_defined = {}
        self.should_copy = True
        # Kept last: mimetype detection may record errors and set
        # symlink_path, so those attributes must already exist.
        self.mimetype = self._determine_mimetype(src_path)

    @property
    def dst_path(self):
        """Destination path: `dst_dir` joined with the current `filename`."""
        return os.path.join(self.dst_dir, self.filename)

    @property
    def extension(self):
        """Lowercased extension of `filename` (with dot), or None if absent."""
        _, ext = os.path.splitext(self.filename)
        if ext == '':
            return None
        return ext.lower()

    @property
    def maintype(self):
        """Part of the mimetype before the slash, or None."""
        main, _ = self._split_mimetype(self.mimetype)
        return main

    @property
    def subtype(self):
        """Part of the mimetype after the slash, or None."""
        _, sub = self._split_mimetype(self.mimetype)
        return sub

    @property
    def has_mimetype(self):
        """True if file has a main and sub mimetype, else False."""
        return bool(self.maintype and self.subtype)

    @property
    def has_extension(self):
        """True if self.extension is set, else False."""
        return self.extension is not None

    @property
    def is_symlink(self):
        """True if file is a symlink, else False."""
        return self.symlink_path is not None

    @property
    def description_string(self):
        """List of description strings attached to the file."""
        return self.__description_string

    @description_string.setter
    def description_string(self, value):
        """
        Initialize the description list, or append one description to it.

        The first assignment stores `value` as the backing list. Every later
        assignment must be a `str`, which is appended unless already present.

        Raises TypeError if a non-string is assigned after initialization.
        """
        # hasattr() calls the getter, which raises AttributeError (so hasattr
        # is False) until the backing attribute has been created.
        if hasattr(self, 'description_string'):
            if not isinstance(value, str):
                raise TypeError("Description_string can only include strings")
            if value not in self.__description_string:
                self.__description_string.append(value)
        else:
            self.__description_string = value

    def set_property(self, prop_string, value):
        """
        Take a property and a value and add them to the file's stored props.

        If `prop_string` names an existing attribute of the file, set it to
        `value`. Otherwise, add `prop_string`: `value` to the `user_defined`
        properties.

        Raises AttributeError if `prop_string` names a read-only property
        (e.g. `extension`, `maintype`, `dst_path`).
        """
        if hasattr(self, prop_string):
            setattr(self, prop_string, value)
        else:
            self._user_defined[prop_string] = value

    def get_property(self, prop_string):
        """
        Get the value for a property stored on the file.

        Returns `None` if `prop_string` cannot be found on the file.
        """
        try:
            return getattr(self, prop_string)
        except AttributeError:
            return self._user_defined.get(prop_string, None)

    def get_all_props(self):
        """Return a dict containing all stored properties of this file."""
        # Maybe move this onto the logger? I think that makes more sense
        return {
            'filepath': self.src_path,
            'filename': self.filename,
            'file_size': self.size,
            'mimetype': self.mimetype,
            'maintype': self.maintype,
            'subtype': self.subtype,
            'extension': self.extension,
            'is_dangerous': self.is_dangerous,
            'symlink_path': self.symlink_path,
            'copied': self.copied,
            'description_string': self.description_string,
            'errors': self._errors,
            'user_defined': self._user_defined,
        }

    def add_error(self, error, info_string):
        """Add an `error`: `info_string` pair to the file."""
        self._errors[error] = info_string

    def add_description(self, description_string):
        """
        Add a description string to the file.

        If `description_string` is already present, will prevent duplicates.
        """
        self.set_property('description_string', description_string)

    def make_dangerous(self, reason_string=None):
        """
        Mark file as dangerous.

        Prepend and append DANGEROUS to the destination file name
        to help prevent double-click of death. The rename happens only the
        first time; `reason_string`, when given, is always recorded as a
        description.
        """
        if not self.is_dangerous:
            self.set_property('is_dangerous', True)
            self.filename = 'DANGEROUS_{}_DANGEROUS'.format(self.filename)
        if reason_string:
            self.add_description(reason_string)

    def safe_copy(self, src=None, dst=None):
        """
        Copy file and create destination directories if needed.

        `src` defaults to `self.src_path` and `dst` to `self.dst_path`.
        I/O failures are recorded with `add_error` instead of being raised.
        """
        if src is None:
            src = self.src_path
        if dst is None:
            dst = self.dst_path
        try:
            os.makedirs(self.dst_dir, exist_ok=True)
            shutil.copy(src, dst)
        except IOError as e:
            # Probably means we can't write in the dest dir
            self.add_error(e, '')

    def force_ext(self, extension):
        """If `filename` does not end in `extension`, append it (with a dot)."""
        new_ext = self._check_leading_dot(extension)
        if not self.filename.endswith(new_ext):
            # TODO: log that the extension was changed
            self.filename += new_ext
        # `extension` is derived from `filename`, so there is nothing else to
        # update; assigning to that read-only property raises AttributeError.

    def create_metadata_file(self, extension):
        # TODO: this method name is confusing
        """
        Create a separate file to hold extracted metadata.

        The string `extension` will be used as the extension for the file.

        Returns the metadata file path (also stored on
        `self.metadata_file_path`), or False if a file already exists at
        `src_path` + extension; in that case the error is recorded on the
        file.
        """
        ext = self._check_leading_dot(extension)
        try:
            # Prevent using the same path as another file from src_path
            if os.path.exists(self.src_path + ext):
                raise KittenGroomerError(
                    'Could not create metadata file for "{}": '
                    'a file with that path exists.'.format(self.filename))
            os.makedirs(self.dst_dir, exist_ok=True)
            # TODO: shouldn't mutate state and also return something
            self.metadata_file_path = self.dst_path + ext
            return self.metadata_file_path
        # TODO: can probably let this exception bubble up
        except KittenGroomerError as e:
            self.add_error(e, '')
            return False

    def _check_leading_dot(self, ext):
        """Return `ext` with a leading dot added if it is non-empty and lacks one."""
        # TODO: this method name is confusing
        if ext and not ext.startswith('.'):
            return '.' + ext
        return ext

    def _determine_mimetype(self, file_path):
        """
        Return the mimetype of the file at `file_path`.

        Symlinks are reported as 'inode/symlink' and their target is stored
        in `symlink_path`. Returns None if libmagic cannot handle the path.
        """
        if os.path.islink(file_path):
            # libmagic will throw an IOError on a broken symlink
            self.set_property('symlink_path', os.readlink(file_path))
            return 'inode/symlink'
        try:
            # libmagic will always return something, even if it's just 'data'
            mt = magic.from_file(file_path, mime=True)
        except UnicodeEncodeError as e:
            # FIXME: The encoding of the file that triggers this is broken
            # (possibly it's UTF-16 and Python expects utf8)
            # Note: one of the Travis files will trigger this exception
            self.add_error(e, '')
            return None
        # Only bytes results need decoding; str results are returned as-is.
        if isinstance(mt, bytes):
            try:
                return mt.decode('utf-8')
            except UnicodeDecodeError:
                # Keep the raw value rather than losing the information.
                return mt
        return mt

    def _split_mimetype(self, mimetype):
        """Split 'main/sub' into (main, sub); (None, None) if not splittable."""
        if isinstance(mimetype, str) and '/' in mimetype:
            # Split on the first slash only so unpacking can never fail.
            main_type, _, sub_type = mimetype.partition('/')
            return main_type, sub_type
        return None, None

    def _get_size(self, file_path):
        """Filesize in bytes as an int, 0 if file does not exist."""
        try:
            return os.path.getsize(file_path)
        except FileNotFoundError:
            return 0
2015-05-11 14:32:59 +02:00
2017-04-10 13:07:21 +02:00
class Logging(object):
    """Namespace for logging-related helpers; currently only `computehash`."""

    @staticmethod
    def computehash(path):
        """Return the sha256 hash of a file at a given path."""
        digest = hashlib.sha256()
        # Binary mode ('rb', no padding) so the bytes are hashed unmodified;
        # read in 1 MiB chunks so large files are never loaded whole.
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(0x100000), b''):
                digest.update(chunk)
        return digest.hexdigest()
class KittenGroomerBase(object):
    """Base object responsible for copy/sanitization process."""

    def __init__(self, src_root_path, dst_root_path):
        """Initialized with path to source and dest directories."""
        self.src_root_path = os.path.abspath(src_root_path)
        self.dst_root_path = os.path.abspath(dst_root_path)

    def safe_rmtree(self, directory_path):
        """Remove a directory tree if it exists."""
        if os.path.exists(directory_path):
            shutil.rmtree(directory_path)

    def safe_remove(self, file_path):
        """Remove file at file_path if it exists."""
        # lexists() also reports broken symlinks, which exists() would skip
        # and therefore leave behind.
        if os.path.lexists(file_path):
            os.remove(file_path)

    def safe_mkdir(self, directory_path):
        """Make a directory if it does not exist."""
        if not os.path.exists(directory_path):
            # exist_ok covers the directory appearing between check and call.
            os.makedirs(directory_path, exist_ok=True)

    def list_all_files(self, directory_path):
        """Generator yielding path to all of the files in a directory tree."""
        for root, _dirs, files in os.walk(directory_path):
            # files is a list anyway so we don't get much from using a
            # generator here
            for filename in files:
                yield os.path.join(root, filename)

    #######################

    def processdir(self, src_dir=None, dst_dir=None):
        """
        Implement this function to define file processing behavior.

        Both arguments default to None so the method can be called without
        arguments. The base implementation always raises
        ImplementationRequired.
        """
        raise ImplementationRequired('Please implement processdir.')
2015-05-11 14:32:59 +02:00
2017-07-11 20:45:12 +02:00
class KittenGroomerError(Exception):
    """Root exception type for KittenGroomer; keeps its text on `.message`."""

    def __init__(self, message):
        # Pass the message to Exception so str(err) and err.args carry it.
        super().__init__(message)
        self.message = message
class ImplementationRequired(KittenGroomerError):
    """Error signalling that a required method has not been implemented."""
2016-12-06 18:43:28 +01:00
def main(kg_implementation, description='Call a KittenGroomer implementation to process files present in the source directory and copy them to the destination directory.'):
    """
    Parse command-line arguments and run a KittenGroomer implementation.

    `kg_implementation` is called with the parsed source and destination
    values, and `processdir()` is then called on the result with no
    arguments. `description` is the text shown in the argparse help output.
    """
    parser = argparse.ArgumentParser(prog='KittenGroomer', description=description)
    # Option strings must start with '-'; argparse rejects padded names.
    parser.add_argument('-s', '--source', type=str, help='Source directory')
    parser.add_argument('-d', '--destination', type=str, help='Destination directory')
    args = parser.parse_args()
    kg = kg_implementation(args.source, args.destination)
    kg.processdir()