mirror of https://github.com/CIRCL/PyCIRCLean
Add new logger, move logging to filecheck
* Wrote a new text-based logger that displays all file information in the tree instead of using two separate logs * Stopped using twiggy since it wasn't giving us anything useful * Moved a lot of the logging code to filecheck, since it didn't really seem appropriate as an API. Left a Logging stub in kittengroomer to hold methods that might be useful for implementing other loggers. * For the new logger, had to change the way that we traverse the items in the source file tree.pull/14/head
parent
f0e7607a3f
commit
3f49612a23
144
bin/filecheck.py
144
bin/filecheck.py
|
@ -6,6 +6,7 @@ import shlex
|
||||||
import subprocess
|
import subprocess
|
||||||
import zipfile
|
import zipfile
|
||||||
import argparse
|
import argparse
|
||||||
|
import shutil
|
||||||
|
|
||||||
import oletools.oleid
|
import oletools.oleid
|
||||||
import olefile
|
import olefile
|
||||||
|
@ -17,7 +18,7 @@ from PIL import Image
|
||||||
# from PIL import PngImagePlugin
|
# from PIL import PngImagePlugin
|
||||||
from pdfid import PDFiD, cPDFiD
|
from pdfid import PDFiD, cPDFiD
|
||||||
|
|
||||||
from kittengroomer import FileBase, KittenGroomerBase, GroomerLogger
|
from kittengroomer import FileBase, KittenGroomerBase, Logging
|
||||||
|
|
||||||
|
|
||||||
SEVENZ_PATH = '/usr/bin/7z'
|
SEVENZ_PATH = '/usr/bin/7z'
|
||||||
|
@ -90,6 +91,7 @@ class File(FileBase):
|
||||||
super(File, self).__init__(src_path, dst_path)
|
super(File, self).__init__(src_path, dst_path)
|
||||||
self.is_recursive = False
|
self.is_recursive = False
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
|
self.tempdir_path = self.dst_path + '_temp'
|
||||||
|
|
||||||
subtypes_apps = [
|
subtypes_apps = [
|
||||||
(Config.mimes_office, self._winoffice),
|
(Config.mimes_office, self._winoffice),
|
||||||
|
@ -190,7 +192,12 @@ class File(FileBase):
|
||||||
|
|
||||||
def write_log(self):
|
def write_log(self):
|
||||||
props = self.get_all_props()
|
props = self.get_all_props()
|
||||||
self.logger.add_file(props)
|
if not self.is_recursive:
|
||||||
|
if os.path.exists(self.tempdir_path):
|
||||||
|
# Hack to make images appear at the correct tree depth in log
|
||||||
|
self.logger.add_file(self.src_path, props, in_tempdir=True)
|
||||||
|
return
|
||||||
|
self.logger.add_file(self.src_path, props)
|
||||||
|
|
||||||
# ##### Helper functions #####
|
# ##### Helper functions #####
|
||||||
def _make_method_dict(self, list_of_tuples):
|
def _make_method_dict(self, list_of_tuples):
|
||||||
|
@ -210,7 +217,6 @@ class File(FileBase):
|
||||||
|
|
||||||
def make_tempdir(self):
|
def make_tempdir(self):
|
||||||
"""Make a temporary directory at self.tempdir_path."""
|
"""Make a temporary directory at self.tempdir_path."""
|
||||||
self.tempdir_path = self.dst_path + '_temp'
|
|
||||||
if not os.path.exists(self.tempdir_path):
|
if not os.path.exists(self.tempdir_path):
|
||||||
os.makedirs(self.tempdir_path)
|
os.makedirs(self.tempdir_path)
|
||||||
return self.tempdir_path
|
return self.tempdir_path
|
||||||
|
@ -490,9 +496,93 @@ class File(FileBase):
|
||||||
except Exception as e: # Catch decompression bombs
|
except Exception as e: # Catch decompression bombs
|
||||||
# TODO: change this from all Exceptions to specific DecompressionBombWarning
|
# TODO: change this from all Exceptions to specific DecompressionBombWarning
|
||||||
self.add_error(e, "Caught exception (possible decompression bomb?) while translating file {}.".format(self.src_path))
|
self.add_error(e, "Caught exception (possible decompression bomb?) while translating file {}.".format(self.src_path))
|
||||||
self.make_dangerous()
|
self.make_dangerous('Image file containing decompression bomb')
|
||||||
self.add_description('Image file')
|
if not self.is_dangerous:
|
||||||
self.set_property('processing_type', 'image')
|
self.add_description('Image file')
|
||||||
|
|
||||||
|
|
||||||
|
class GroomerLogger(object):
|
||||||
|
"""Groomer logging interface."""
|
||||||
|
|
||||||
|
def __init__(self, src_root_path, dst_root_path, debug=False):
|
||||||
|
self._src_root_path = src_root_path
|
||||||
|
self._dst_root_path = dst_root_path
|
||||||
|
self._log_dir_path = self._make_log_dir(dst_root_path)
|
||||||
|
self.log_path = os.path.join(self._log_dir_path, 'circlean_log.txt')
|
||||||
|
self._add_root_dir(src_root_path)
|
||||||
|
if debug:
|
||||||
|
self.log_debug_err = os.path.join(self._log_dir_path, 'debug_stderr.log')
|
||||||
|
self.log_debug_out = os.path.join(self._log_dir_path, 'debug_stdout.log')
|
||||||
|
else:
|
||||||
|
self.log_debug_err = os.devnull
|
||||||
|
self.log_debug_out = os.devnull
|
||||||
|
|
||||||
|
def _make_log_dir(self, root_dir_path):
|
||||||
|
"""Make the directory in the dest dir that will hold the logs"""
|
||||||
|
log_dir_path = os.path.join(root_dir_path, 'logs')
|
||||||
|
if os.path.exists(log_dir_path):
|
||||||
|
shutil.rmtree(log_dir_path)
|
||||||
|
os.makedirs(log_dir_path)
|
||||||
|
return log_dir_path
|
||||||
|
|
||||||
|
def _add_root_dir(self, root_path):
|
||||||
|
dirname = os.path.split(root_path)[1] + '/'
|
||||||
|
with open(self.log_path, mode='ab') as lf:
|
||||||
|
lf.write(bytes(dirname, 'utf-8'))
|
||||||
|
lf.write(b'\n')
|
||||||
|
|
||||||
|
def add_file(self, file_path, file_props, in_tempdir=False):
|
||||||
|
"""Add a file to the log. Takes a dict of file properties."""
|
||||||
|
# TODO: fix var names in this method
|
||||||
|
# TODO: handle symlinks better: symlink_string = '{}+-- {}\t- Symbolic link to {}\n'.format(padding, f, os.readlink(curpath))
|
||||||
|
props = file_props
|
||||||
|
depth = self._get_path_depth(file_path)
|
||||||
|
description_string = ', '.join(props['description_string'])
|
||||||
|
file_hash = Logging.computehash(file_path)[:6]
|
||||||
|
if props['safety_category'] is None:
|
||||||
|
descr_cat = "Normal"
|
||||||
|
else:
|
||||||
|
descr_cat = props['safety_category'].capitalize()
|
||||||
|
# TODO: make size adjust to MB/GB for large files
|
||||||
|
size = str(props['file_size']) + 'B'
|
||||||
|
file_template = "+- {name} ({sha_hash}): {size}, {mt}/{st}. {desc}: {desc_str}"
|
||||||
|
file_string = file_template.format(
|
||||||
|
name=props['filename'],
|
||||||
|
sha_hash=file_hash,
|
||||||
|
size=size,
|
||||||
|
mt=props['maintype'],
|
||||||
|
st=props['subtype'],
|
||||||
|
desc=descr_cat,
|
||||||
|
desc_str=description_string,
|
||||||
|
# errs='' # TODO: add errors in human readable form here
|
||||||
|
)
|
||||||
|
if in_tempdir:
|
||||||
|
depth -= 1
|
||||||
|
self._write_line_to_log(file_string, depth)
|
||||||
|
|
||||||
|
def add_dir(self, dir_path):
|
||||||
|
path_depth = self._get_path_depth(dir_path)
|
||||||
|
dirname = os.path.split(dir_path)[1] + '/'
|
||||||
|
log_line = '+- ' + dirname
|
||||||
|
self._write_line_to_log(log_line, path_depth)
|
||||||
|
|
||||||
|
def _get_path_depth(self, path):
|
||||||
|
if self._dst_root_path in path:
|
||||||
|
base_path = self._dst_root_path
|
||||||
|
elif self._src_root_path in path:
|
||||||
|
base_path = self._src_root_path
|
||||||
|
relpath = os.path.relpath(path, base_path)
|
||||||
|
path_depth = relpath.count(os.path.sep)
|
||||||
|
return path_depth
|
||||||
|
|
||||||
|
def _write_line_to_log(self, line, indentation_depth):
|
||||||
|
# TODO: should we use fsencode and fsdecode here instead of just bytestrings?
|
||||||
|
padding = b' '
|
||||||
|
padding += b'| ' * indentation_depth
|
||||||
|
with open(self.log_path, mode='ab') as lf:
|
||||||
|
lf.write(padding)
|
||||||
|
lf.write(bytes(line, encoding='utf-8'))
|
||||||
|
lf.write(b'\n')
|
||||||
|
|
||||||
|
|
||||||
class KittenGroomerFileCheck(KittenGroomerBase):
|
class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
|
@ -502,20 +592,17 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
self.recursive_archive_depth = 0
|
self.recursive_archive_depth = 0
|
||||||
self.max_recursive_depth = max_recursive_depth
|
self.max_recursive_depth = max_recursive_depth
|
||||||
self.cur_file = None
|
self.cur_file = None
|
||||||
self.logger = GroomerLogger(self.dst_root_path, debug)
|
self.logger = GroomerLogger(root_src, root_dst, debug)
|
||||||
|
|
||||||
def process_dir(self, src_dir, dst_dir):
|
def process_dir(self, src_dir, dst_dir):
|
||||||
"""Process a directory on the source key."""
|
"""Process a directory on the source key."""
|
||||||
self.logger.tree(src_dir)
|
for srcpath in self.list_files_dirs(src_dir):
|
||||||
for srcpath in self.list_all_files(src_dir):
|
if os.path.isdir(srcpath):
|
||||||
dstpath = srcpath.replace(src_dir, dst_dir)
|
self.logger.add_dir(srcpath)
|
||||||
# TODO: Can we clean up the way we handle relative_path?
|
else:
|
||||||
# Relative path is here so that when we print files in the log it
|
dstpath = os.path.join(dst_dir, os.path.basename(srcpath))
|
||||||
# shows only the file's path. Should we just pass it to the logger
|
self.cur_file = File(srcpath, dstpath, self.logger)
|
||||||
# when we create it? Or let the logger figure it out?
|
self.process_file(self.cur_file)
|
||||||
# relative_path = srcpath.replace(src_dir + '/', '')
|
|
||||||
self.cur_file = File(srcpath, dstpath, self.logger)
|
|
||||||
self.process_file(self.cur_file)
|
|
||||||
|
|
||||||
def process_file(self, file):
|
def process_file(self, file):
|
||||||
"""
|
"""
|
||||||
|
@ -525,12 +612,13 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
the file to the destionation key, and clean up temporary directory.
|
the file to the destionation key, and clean up temporary directory.
|
||||||
"""
|
"""
|
||||||
file.check()
|
file.check()
|
||||||
if file.is_recursive:
|
if file.should_copy:
|
||||||
self.process_archive(file)
|
|
||||||
elif file.should_copy:
|
|
||||||
file.safe_copy()
|
file.safe_copy()
|
||||||
file.set_property('copied', True)
|
file.set_property('copied', True)
|
||||||
file.write_log()
|
file.write_log()
|
||||||
|
if file.is_recursive:
|
||||||
|
self.process_archive(file)
|
||||||
|
# TODO: Can probably handle cleaning up the tempdir better
|
||||||
if hasattr(file, 'tempdir_path'):
|
if hasattr(file, 'tempdir_path'):
|
||||||
self.safe_rmtree(file.tempdir_path)
|
self.safe_rmtree(file.tempdir_path)
|
||||||
|
|
||||||
|
@ -547,11 +635,12 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
else:
|
else:
|
||||||
tempdir_path = file.make_tempdir()
|
tempdir_path = file.make_tempdir()
|
||||||
# TODO: double check we are properly escaping file.src_path
|
# TODO: double check we are properly escaping file.src_path
|
||||||
# otherwise we are running unvalidated user input directly in the shell
|
# otherwise we are running unsanitized user input directly in the shell
|
||||||
command_str = '{} -p1 x "{}" -o"{}" -bd -aoa'
|
command_str = '{} -p1 x "{}" -o"{}" -bd -aoa'
|
||||||
unpack_command = command_str.format(SEVENZ_PATH,
|
unpack_command = command_str.format(SEVENZ_PATH,
|
||||||
file.src_path, tempdir_path)
|
file.src_path, tempdir_path)
|
||||||
self._run_process(unpack_command)
|
self._run_process(unpack_command)
|
||||||
|
file.write_log()
|
||||||
self.process_dir(tempdir_path, file.dst_path)
|
self.process_dir(tempdir_path, file.dst_path)
|
||||||
self.safe_rmtree(tempdir_path)
|
self.safe_rmtree(tempdir_path)
|
||||||
self.recursive_archive_depth -= 1
|
self.recursive_archive_depth -= 1
|
||||||
|
@ -566,6 +655,17 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
return
|
return
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def list_files_dirs(self, root_dir_path):
|
||||||
|
queue = []
|
||||||
|
for path in sorted(os.listdir(root_dir_path), key=lambda x: str.lower(x)):
|
||||||
|
full_path = os.path.join(root_dir_path, path)
|
||||||
|
if os.path.isdir(full_path):
|
||||||
|
queue.append(full_path)
|
||||||
|
queue += self.list_files_dirs(full_path) # if path is a dir, recurse through its contents
|
||||||
|
elif os.path.isfile(full_path):
|
||||||
|
queue.append(full_path)
|
||||||
|
return queue
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
self.process_dir(self.src_root_path, self.dst_root_path)
|
self.process_dir(self.src_root_path, self.dst_root_path)
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from .helpers import FileBase, KittenGroomerBase, GroomerLogger, main
|
from .helpers import FileBase, KittenGroomerBase, Logging, main
|
||||||
|
|
|
@ -14,7 +14,6 @@ import shutil
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
import magic
|
import magic
|
||||||
# import twiggy
|
|
||||||
|
|
||||||
|
|
||||||
class KittenGroomerError(Exception):
|
class KittenGroomerError(Exception):
|
||||||
|
@ -286,52 +285,11 @@ class FileBase(object):
|
||||||
return ext
|
return ext
|
||||||
|
|
||||||
|
|
||||||
class GroomerLogger(object):
|
class Logging(object):
|
||||||
"""Groomer logging interface."""
|
|
||||||
|
|
||||||
def __init__(self, root_dir_path, debug=False):
|
@staticmethod
|
||||||
self._root_dir_path = root_dir_path
|
def computehash(path):
|
||||||
self._log_dir_path = self._make_log_dir(root_dir_path)
|
"""Return the sha256 hash of a file at a given path."""
|
||||||
# LOG: rename logfile to something more descriptive
|
|
||||||
self.log_path = os.path.join(self._log_dir_path, 'log.txt')
|
|
||||||
# twiggy.quick_setup(file=self.log_processing)
|
|
||||||
# self.log = twiggy.log.name('files')
|
|
||||||
if debug:
|
|
||||||
self.log_debug_err = os.path.join(self._log_dir_path, 'debug_stderr.log')
|
|
||||||
self.log_debug_out = os.path.join(self._log_dir_path, 'debug_stdout.log')
|
|
||||||
else:
|
|
||||||
self.log_debug_err = os.devnull
|
|
||||||
self.log_debug_out = os.devnull
|
|
||||||
|
|
||||||
def _make_log_dir(self, root_dir_path):
|
|
||||||
log_dir_path = os.path.join(root_dir_path, 'logs')
|
|
||||||
if os.path.exists(log_dir_path):
|
|
||||||
shutil.rmtree(log_dir_path)
|
|
||||||
os.makedirs(log_dir_path)
|
|
||||||
return log_dir_path
|
|
||||||
|
|
||||||
def tree(self, base_dir, padding=' '):
|
|
||||||
"""Write a graphical tree to the log for `base_dir`."""
|
|
||||||
horizontal_divider = '#' * 80 + '\n'
|
|
||||||
with open(self.log_path, mode='a', encoding='utf-8') as lf:
|
|
||||||
lf.write(horizontal_divider)
|
|
||||||
base_dir_name = os.path.basename(os.path.abspath(base_dir))
|
|
||||||
lf.write('{}+- {}/\n'.format(padding, base_dir_name))
|
|
||||||
padding += '| '
|
|
||||||
# TODO: make sure this gets all sub directories and use scandir()
|
|
||||||
files = sorted(os.listdir(base_dir))
|
|
||||||
for f in files:
|
|
||||||
curpath = os.path.join(base_dir, f)
|
|
||||||
if os.path.islink(curpath):
|
|
||||||
lf.write('{}+-- {}\t- Symbolic link to {}\n'.format(padding, f, os.readlink(curpath)))
|
|
||||||
elif os.path.isdir(curpath):
|
|
||||||
self.tree(curpath, padding)
|
|
||||||
elif os.path.isfile(curpath):
|
|
||||||
lf.write('{}+-- {}\t- {}\n'.format(padding, f, self._computehash(curpath)))
|
|
||||||
lf.write(horizontal_divider)
|
|
||||||
|
|
||||||
def _computehash(self, path):
|
|
||||||
"""Return a sha256 hash of a file at a given path."""
|
|
||||||
s = hashlib.sha256()
|
s = hashlib.sha256()
|
||||||
with open(path, 'rb') as f:
|
with open(path, 'rb') as f:
|
||||||
while True:
|
while True:
|
||||||
|
@ -341,23 +299,6 @@ class GroomerLogger(object):
|
||||||
s.update(buf)
|
s.update(buf)
|
||||||
return s.hexdigest()
|
return s.hexdigest()
|
||||||
|
|
||||||
def _write_file_to_disk(self, file_string):
|
|
||||||
with open(self.log_path, mode='a', encoding='utf-8') as lf:
|
|
||||||
lf.write(file_string)
|
|
||||||
|
|
||||||
def add_file(self, file_props):
|
|
||||||
"""Add a file to the log. Takes a dict of file properties."""
|
|
||||||
props = file_props
|
|
||||||
description_string = ', '.join(props['description_string'])
|
|
||||||
file_string = " * {}: {}/{}, {}: {}\n".format(
|
|
||||||
props['filename'],
|
|
||||||
props['maintype'],
|
|
||||||
props['subtype'],
|
|
||||||
props['safety_category'],
|
|
||||||
description_string
|
|
||||||
)
|
|
||||||
self._write_file_to_disk(file_string)
|
|
||||||
|
|
||||||
|
|
||||||
class KittenGroomerBase(object):
|
class KittenGroomerBase(object):
|
||||||
"""Base object responsible for copy/sanitization process."""
|
"""Base object responsible for copy/sanitization process."""
|
||||||
|
|
|
@ -5,7 +5,7 @@ import os
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from kittengroomer import FileBase, KittenGroomerBase, GroomerLogger
|
from kittengroomer import FileBase, KittenGroomerBase
|
||||||
|
|
||||||
skip = pytest.mark.skip
|
skip = pytest.mark.skip
|
||||||
xfail = pytest.mark.xfail
|
xfail = pytest.mark.xfail
|
||||||
|
@ -216,12 +216,7 @@ class TestFileBase:
|
||||||
|
|
||||||
class TestLogger:
|
class TestLogger:
|
||||||
|
|
||||||
@fixture
|
pass
|
||||||
def generic_logger(self, tmpdir):
|
|
||||||
return GroomerLogger(tmpdir.strpath)
|
|
||||||
|
|
||||||
def test_tree(self, generic_logger, tmpdir):
|
|
||||||
generic_logger.tree(tmpdir.strpath)
|
|
||||||
|
|
||||||
|
|
||||||
class TestKittenGroomerBase:
|
class TestKittenGroomerBase:
|
||||||
|
|
Loading…
Reference in New Issue