2015-05-11 14:32:59 +02:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
2016-12-06 03:02:46 +01:00
"""
Contains the base objects for use when creating a sanitizer using
PyCIRCLean. Subclass FileBase and KittenGroomerBase to implement your
desired behavior.
"""
2015-05-11 14:32:59 +02:00
import os
2015-11-23 19:54:29 +01:00
import hashlib
2015-05-11 14:32:59 +02:00
import shutil
import argparse
2016-12-01 03:04:59 +01:00
import magic
2017-02-21 01:03:11 +01:00
import twiggy
2016-12-01 03:04:59 +01:00
2015-05-11 14:32:59 +02:00
class KittenGroomerError(Exception):
    """
    Base KittenGroomer exception.

    The text given to the constructor is passed on to ``Exception`` and is
    also stored on the instance as ``message``.
    """

    def __init__(self, message):
        # Keep the text reachable as an attribute as well as through args.
        self.message = message
        super(KittenGroomerError, self).__init__(message)
class ImplementationRequired(KittenGroomerError):
    """Error signalling that a required implementation is missing."""
class FileBase(object):
    """
    Base object for individual files in the source directory.

    Holds the source path, the expected destination path, the detected
    extension and mimetype, and a ``log_details`` dict in which the helper
    methods record what was found or decided about the file. Subclass and
    add attributes or methods relevant to a given implementation.
    """

    def __init__(self, src_path, dst_path, logger=None):
        """
        Initialized with the source path and expected destination path.

        :param src_path: path of the file being examined.
        :param dst_path: path the file is expected to be written to.
        :param logger: optional logger object, stored as ``self.logger``.
        """
        self.src_path = src_path
        self.dst_path = dst_path
        self.log_details = {'filepath': self.src_path}
        self.log_string = ''
        self._determine_extension()
        self._determine_mimetype()
        self.logger = logger

    def _determine_extension(self):
        """Store the lower-cased extension of src_path in self.extension."""
        _, ext = os.path.splitext(self.src_path)
        self.extension = ext.lower()

    def _determine_mimetype(self):
        """
        Set self.mimetype, self.main_type and self.sub_type.

        Symlinks are labelled 'inode/symlink' without calling magic. When no
        'main/sub' mimetype is available, main_type and sub_type are set to
        empty strings.
        """
        if os.path.islink(self.src_path):
            # magic will throw an IOError on a broken symlink
            self.mimetype = 'inode/symlink'
        else:
            try:
                # magic will always return something, even if it's just 'data'
                mt = magic.from_file(self.src_path, mime=True)
            except UnicodeEncodeError as e:
                # FIXME: The encoding of the file is broken (possibly UTF-16)
                mt = ''
                self.log_details.update({'UnicodeError': e})
            # Normalise a bytes result to text so the '/' test below cannot
            # fail on a bytes value; any other value is stored unchanged.
            if isinstance(mt, bytes):
                mt = mt.decode('utf-8', errors='replace')
            self.mimetype = mt
        if self.mimetype and '/' in self.mimetype:
            # Split on the first '/' only, so an unexpected extra slash ends
            # up in sub_type instead of raising ValueError.
            self.main_type, self.sub_type = self.mimetype.split('/', 1)
        else:
            self.main_type = ''
            self.sub_type = ''

    def has_mimetype(self):
        """
        Returns True if file has a full mimetype, else False.

        Returns False + updates log if self.main_type or self.sub_type
        are not set.
        """
        if not self.main_type or not self.sub_type:
            self.log_details.update({'broken_mime': True})
            return False
        return True

    def has_extension(self):
        """
        Returns True if self.extension is set, else False.

        Returns False + updates self.log_details if self.extension is not set.
        """
        if self.extension == '':
            self.log_details.update({'no_extension': True})
            return False
        return True

    def is_dangerous(self):
        """Returns True if self.log_details contains 'dangerous'."""
        return 'dangerous' in self.log_details

    def is_unknown(self):
        """Returns True if self.log_details contains 'unknown'."""
        return 'unknown' in self.log_details

    def is_binary(self):
        """Returns True if self.log_details contains 'binary'."""
        return 'binary' in self.log_details

    def is_symlink(self):
        """
        Returns True and updates log if file is a symlink.

        The link target read with os.readlink is stored in log_details
        under 'symlink'.
        """
        if (self.has_mimetype() and self.main_type == 'inode'
                and self.sub_type == 'symlink'):
            self.log_details.update({'symlink': os.readlink(self.src_path)})
            return True
        return False

    def add_log_details(self, key, value):
        """Takes a key + a value and adds them to self.log_details."""
        self.log_details[key] = value

    def make_dangerous(self):
        """
        Marks a file as dangerous.

        Prepends and appends DANGEROUS to the destination file name
        to help prevent double-click of death. Does nothing if the file is
        already marked dangerous.
        """
        if self.is_dangerous():
            return
        self.log_details['dangerous'] = True
        path, filename = os.path.split(self.dst_path)
        self.dst_path = os.path.join(path, 'DANGEROUS_{}_DANGEROUS'.format(filename))

    def make_unknown(self):
        """
        Marks a file as an unknown type and prepends UNKNOWN to filename.

        Does nothing if the file is already marked dangerous or binary.
        """
        if self.is_dangerous() or self.is_binary():
            return
        self.log_details['unknown'] = True
        path, filename = os.path.split(self.dst_path)
        self.dst_path = os.path.join(path, 'UNKNOWN_{}'.format(filename))

    def make_binary(self):
        """
        Marks a file as a binary and appends .bin to filename.

        Does nothing if the file is already marked dangerous.
        """
        if self.is_dangerous():
            return
        self.log_details['binary'] = True
        path, filename = os.path.split(self.dst_path)
        self.dst_path = os.path.join(path, '{}.bin'.format(filename))

    def force_ext(self, ext):
        """If dst_path does not end in ext, appends the ext and updates log."""
        if not self.dst_path.endswith(ext):
            self.log_details['force_ext'] = True
            self.dst_path += ext

    def create_metadata_file(self, ext):
        """
        Create a separate file to hold this file's metadata.

        Only the path is computed and stored in self.metadata_file_path; no
        file is written here.

        :param ext: suffix appended to dst_path to form the metadata path.
        :returns: the metadata file path, or False if src_path + ext
            already exists.
        """
        try:
            # make sure we aren't overwriting anything
            if os.path.exists(self.src_path + ext):
                raise KittenGroomerError("Cannot create split metadata file for \"" +
                                         self.dst_path + "\", type '" +
                                         ext + "': File exists.")
            else:
                # TODO: Uncomment these after object relationships are fixed
                # dst_dir_path, filename = os.path.split(self.dst_path)
                # self._safe_mkdir(dst_dir_path)
                # TODO: Check extension for leading "."
                self.metadata_file_path = self.dst_path + ext
                return self.metadata_file_path
        except KittenGroomerError:
            # TODO: Write to log file
            return False
2015-05-11 14:32:59 +02:00
2017-02-21 01:03:11 +01:00
class GroomerLog(object):
    """
    Groomer logging object.

    Creates a fresh 'logs' directory under the given root, points twiggy at
    'processing.log' inside it, and can append a directory listing with
    sha256 hashes to 'content.log'.
    """

    def __init__(self, root_dir, debug=False):
        """
        Prepare the log directory and the log file paths.

        An existing 'logs' directory under root_dir is deleted first.

        :param root_dir: directory in which the 'logs' directory is created.
        :param debug: when true, the debug stdout/stderr paths point at
            files in the log directory; otherwise at os.devnull.
        """
        self.log_dir_path = os.path.join(root_dir, 'logs')
        if os.path.exists(self.log_dir_path):
            shutil.rmtree(self.log_dir_path)
        os.makedirs(self.log_dir_path)
        self.log_processing = os.path.join(self.log_dir_path, 'processing.log')
        self.log_content = os.path.join(self.log_dir_path, 'content.log')
        twiggy.quick_setup(file=self.log_processing)
        self.log = twiggy.log.name('files')
        if debug:
            self.log_debug_err = os.path.join(self.log_dir_path, 'debug_stderr.log')
            self.log_debug_out = os.path.join(self.log_dir_path, 'debug_stdout.log')
        else:
            self.log_debug_err = os.devnull
            self.log_debug_out = os.devnull

    def tree(self, base_dir, padding='   '):
        """
        Writes a graphical tree to the log for a given directory.

        Appends to self.log_content: a divider line, a line for base_dir,
        then one line per entry in sorted order. Symlinks are listed with
        their target, regular files with their sha256 hash, and
        sub-directories are listed recursively.

        :param base_dir: directory to list.
        :param padding: prefix written before each line at this level.
        """
        with open(self.log_content, 'ab') as lf:
            lf.write(bytes('#' * 80 + '\n', 'UTF-8'))
            # Format the name as text; formatting an encoded name would put
            # its b'...' repr into the log.
            dir_name = os.path.basename(os.path.abspath(base_dir))
            lf.write('{}+- {}/\n'.format(padding, dir_name).encode(errors='ignore'))
            padding += '|  '
            for f in sorted(os.listdir(base_dir)):
                curpath = os.path.join(base_dir, f)
                if os.path.islink(curpath):
                    lf.write('{}+-- {}\t- Symbolic link to {}\n'.format(
                        padding, f, os.readlink(curpath)).encode(errors='ignore'))
                elif os.path.isdir(curpath):
                    # The recursive call opens the same file again in append
                    # mode; flush first so this level's lines land before it.
                    lf.flush()
                    self.tree(curpath, padding)
                elif os.path.isfile(curpath):
                    lf.write('{}+-- {}\t- {}\n'.format(
                        padding, f, self._computehash(curpath)).encode(errors='ignore'))

    def _computehash(self, path):
        """
        Returns a sha256 hash of a file at a given path.

        The file is read in chunks of 0x100000 bytes and the hex digest is
        returned.
        """
        s = hashlib.sha256()
        with open(path, 'rb') as f:
            while True:
                buf = f.read(0x100000)
                if not buf:
                    break
                s.update(buf)
        return s.hexdigest()
class KittenGroomerBase(object):
    """
    Base object responsible for copy/sanitization process.

    Provides filesystem helpers and a processdir hook that subclasses must
    implement.
    """

    def __init__(self, root_src, root_dst, debug=False):
        """
        Initialized with path to source and dest directories.

        :param root_src: source root directory.
        :param root_dst: destination root directory; the GroomerLog is
            created with this directory as its root.
        :param debug: passed on to GroomerLog.
        """
        self.src_root_dir = root_src
        self.dst_root_dir = root_dst
        self.debug = debug
        self.cur_file = None
        self.logger = GroomerLog(self.dst_root_dir, debug)
        # Add data/ to PATH
        # self.resources_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data')
        # os.environ["PATH"] += os.pathsep + self.resources_path

    # ##### Helpers #####
    def _safe_rmtree(self, directory):
        """Remove a directory tree if it exists."""
        if os.path.exists(directory):
            shutil.rmtree(directory)

    def _safe_remove(self, filepath):
        """Remove a file if it exists."""
        if os.path.exists(filepath):
            os.remove(filepath)

    def _safe_mkdir(self, directory):
        """Make a directory if it does not exist."""
        if not os.path.exists(directory):
            os.makedirs(directory)

    def _safe_copy(self, src=None, dst=None):
        """
        Copy a file and create directory if needed.

        :param src: source path; defaults to self.cur_file.src_path.
        :param dst: destination path; defaults to self.cur_file.dst_path.
        :returns: True on success, False if any exception was raised (the
            exception is printed).
        """
        if src is None:
            src = self.cur_file.src_path
        if dst is None:
            dst = self.cur_file.dst_path
        try:
            dst_path, filename = os.path.split(dst)
            # A bare file name has no directory part; os.makedirs('') would
            # raise, so only create the directory when there is one.
            if dst_path:
                self._safe_mkdir(dst_path)
            shutil.copy(src, dst)
            return True
        except Exception as e:
            # TODO: Logfile
            print(e)
            return False

    # TODO: this isn't a private method, change and edit the groomers as well
    def _list_all_files(self, directory):
        """Generator yielding path to all of the files in a directory tree."""
        for root, _dirs, files in os.walk(directory):
            for filename in files:
                yield os.path.join(root, filename)

    #######################

    def processdir(self, src_dir=None, dst_dir=None):
        """
        Implement this function in your subclass to define file processing behavior.

        Both parameters default to None so that a call without arguments
        reaches this method and raises ImplementationRequired.

        :raises ImplementationRequired: always, in this base class.
        """
        raise ImplementationRequired('Please implement processdir.')
2015-05-11 14:32:59 +02:00
2016-12-06 18:43:28 +01:00
def main(kg_implementation, description='Call a KittenGroomer implementation to process files present in the source directory and copy them to the destination directory.'):
    """
    Command-line entry point for a KittenGroomer implementation.

    Parses -s/--source and -d/--destination from the command line, builds
    kg_implementation(source, destination) and calls its processdir().

    :param kg_implementation: callable taking the source and destination
        directories and returning an object with a processdir() method.
    :param description: text shown in the command-line help.
    """
    parser = argparse.ArgumentParser(prog='KittenGroomer', description=description)
    # Both directories are mandatory; without them None would be passed on
    # and fail later with an unrelated error.
    parser.add_argument('-s', '--source', required=True, help='Source directory')
    parser.add_argument('-d', '--destination', required=True, help='Destination directory')
    args = parser.parse_args()
    kg = kg_implementation(args.source, args.destination)
    kg.processdir()