From cc081a921d39f34624e079854f3d542e20770e81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Fri, 11 Nov 2022 14:59:41 +0100 Subject: [PATCH] fix: Make sure the test suite passes --- filecheck/filecheck.py | 35 +++++++++++++++++++---------------- kittengroomer/helpers.py | 13 ++----------- poetry.lock | 6 +++--- pyproject.toml | 2 +- 4 files changed, 25 insertions(+), 31 deletions(-) diff --git a/filecheck/filecheck.py b/filecheck/filecheck.py index 8c61121..cc67ce5 100644 --- a/filecheck/filecheck.py +++ b/filecheck/filecheck.py @@ -11,7 +11,7 @@ import shutil import time import hashlib from pathlib import Path -from typing import Dict, List, Tuple, Callable, Optional +from typing import Dict, List, Tuple, Callable, Optional, Union import oletools.oleid # type: ignore import olefile # type: ignore @@ -34,7 +34,7 @@ class Config: mimes_rtf: Tuple[str, ...] = ('rtf', 'richtext',) mimes_pdf: Tuple[str, ...] = ('pdf', 'postscript',) mimes_xml: Tuple[str, ...] = ('xml',) - mimes_csv: Tuple[str, ...] = ('csv','text/csv') + mimes_csv: Tuple[str, ...] = ('csv', 'text/csv') mimes_ms: Tuple[str, ...] = ('dosexec',) mimes_compressed: Tuple[str, ...] = ('zip', 'rar', 'x-rar', 'bzip2', 'lzip', 'lzma', 'lzop', 'xz', 'compress', 'gzip', 'tar',) @@ -49,20 +49,20 @@ class Config: mimes_metadata: Tuple[str, ...] = ('image/jpeg', 'image/tiff', 'image/png',) # Mimetype aliases - aliases: Dict[str, str] = { + aliases: Dict[str, Union[str, List[str]]] = { # Win executables 'application/x-msdos-program': 'application/x-dosexec', 'application/x-dosexec': 'application/x-msdos-program', # Other apps with confusing mimetypes 'application/rtf': 'text/rtf', - 'application/rar': 'application/x-rar', + 'application/vnd.rar': 'application/x-rar', 'application/ogg': 'audio/ogg', 'audio/ogg': 'application/ogg' } # Mime Type / Extension fix. TODO: Doesn't quite work....???? - mimetypes.add_type('text/plain','.csv',False) - mimetypes.add_type('text/csv','.csv',False) + mimetypes.add_type('text/plain', '.csv', False) + mimetypes.add_type('text/csv', '.csv', False) mimetypes.add_type('application/vnd.apple.numbers', '.numbers', True) mimetypes.add_type('application/vnd.apple.pages', '.pages', False) mimetypes.add_type('application/vnd.apple.keynote', '.keynote', False) @@ -126,12 +126,12 @@ class Config: # In [12]: mimetypes.guess_type('toot.tar.gz', strict=False) # Out[12]: ('application/x-tar', 'gzip') # It works as expected if you do mimetypes.guess_type('application/gzip', strict=False) - override_ext: Dict[str, str] = {'.gz': 'application/gzip' - , '.csv': 'text/csv' #,'text/plain' ) - , '.numbers': 'application/vnd.apple.numbers' #,'application/zip') - , '.pages': 'application/vnd.apple.pages' #,'application/zip') - , '.keynote': 'application/vnd.apple.keynote' #,'application/zip') - } + override_ext: Dict[str, str] = {'.gz': 'application/gzip', + '.csv': 'text/csv', # ,'text/plain' ) + '.numbers': 'application/vnd.apple.numbers', # ,'application/zip') + '.pages': 'application/vnd.apple.pages', # ,'application/zip') + '.keynote': 'application/vnd.apple.keynote' # ,'application/zip') + } SEVENZ_PATH = '/usr/bin/7z' @@ -209,7 +209,10 @@ class File(FileBase): expected_mimetypes = [expected_mimetype] if expected_mimetype in Config.aliases: - expected_mimetypes.append(Config.aliases[expected_mimetype]) + if isinstance(Config.aliases[expected_mimetype], list): + expected_mimetypes += Config.aliases[expected_mimetype] + else: + expected_mimetypes.append(Config.aliases[expected_mimetype]) if (encoding is None) and (os.path.getsize(self.src_path) == 0): is_empty_file = True else: @@ -833,12 +836,12 @@ class KittenGroomerFileCheck(KittenGroomerBase): Performs a depth-first traversal of the file tree. """ - skipped_files = ( '.Trashes', '._.Trashes', '.DS_Store', '.fseventsd', '.Spotlight-V100','System Volume Information') + skipped_files = ('.Trashes', '._.Trashes', '.DS_Store', '.fseventsd', '.Spotlight-V100', 'System Volume Information') queue = [] for path in sorted(os.listdir(root_dir_path), key=lambda x: str.lower(x)): full_path = root_dir_path / path filename = full_path.name - if not filename in skipped_files and not filename.startswith('._'): + if filename not in skipped_files and not filename.startswith('._'): # check for symlinks first to prevent getting trapped in infinite symlink recursion if full_path.is_symlink(): queue.append(full_path) @@ -849,7 +852,7 @@ class KittenGroomerFileCheck(KittenGroomerBase): elif full_path.is_file(): queue.append(full_path) else: - print("SKIPPING: "+filename) + print(f"SKIPPING: {filename}") return queue def run(self): diff --git a/kittengroomer/helpers.py b/kittengroomer/helpers.py index 115b8be..653f536 100644 --- a/kittengroomer/helpers.py +++ b/kittengroomer/helpers.py @@ -243,17 +243,8 @@ class FileBase(object): mimetype = 'inode/symlink' self.set_property('symlink_path', os.readlink(file_path)) else: - try: - mt = magic.from_file(file_path, mime=True) - # libmagic always returns something, even if it's just 'data' - except UnicodeEncodeError as e: - self.add_error(e, '') - mt = None - try: - mimetype = mt.decode("utf-8") # type: ignore - except Exception: - # FIXME: what should the exception be if mimetype isn't utf-8? - mimetype = 'application/octet-stream' + # libmagic always returns something, even if it's just 'data' + mimetype = magic.from_file(file_path, mime=True) return mimetype def _split_mimetype(self, mimetype: str) -> Tuple[Union[str, None], Union[str, None]]: diff --git a/poetry.lock b/poetry.lock index 2ebd743..cec60d1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -197,9 +197,9 @@ lxml = "*" [package.source] type = "git" -url = "https://github.com/grierforensics/officedissector.git" +url = "https://github.com/Rafiot/officedissector.git" reference = "HEAD" -resolved_reference = "2059a5ba08fa139362e3936578f99c4da9a9b55d" +resolved_reference = "e3d9e8e155cc01180524c9b45b9fbec232206121" [[package]] name = "olefile" @@ -444,7 +444,7 @@ python-versions = "*" [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "31838fedc10762d58f75f2822a5a557756f2043f62ed70e3e3bb4ef806958a9b" +content-hash = "dd6a8563c8232219ba31a071d662bee519adaf95317f66ae81b2022269609510" [metadata.files] attrs = [ diff --git a/pyproject.toml b/pyproject.toml index f68ff7e..e41ce4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ pillow = "^9.3.0" olefile = "^0.46" oletools = "^0.60.1" python-magic = "^0.4.27" -officedissector = {git = "https://github.com/grierforensics/officedissector.git"} +officedissector = {git = "https://github.com/Rafiot/officedissector.git"} [tool.poetry.dev-dependencies] tox = "^3.27.0"