Merge pull request #26 from bofrese/ignore_system_files

Ignoring system files and directories. Temporary fix for CSV files
pull/28/head
Raphaël Vinot 2020-09-08 12:33:47 +02:00 committed by GitHub
commit 12eecb1438
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 34 additions and 9 deletions

View File

@@ -34,6 +34,7 @@ class Config:
mimes_rtf: Tuple[str, ...] = ('rtf', 'richtext',) mimes_rtf: Tuple[str, ...] = ('rtf', 'richtext',)
mimes_pdf: Tuple[str, ...] = ('pdf', 'postscript',) mimes_pdf: Tuple[str, ...] = ('pdf', 'postscript',)
mimes_xml: Tuple[str, ...] = ('xml',) mimes_xml: Tuple[str, ...] = ('xml',)
mimes_csv: Tuple[str, ...] = ('csv','text/csv')
mimes_ms: Tuple[str, ...] = ('dosexec',) mimes_ms: Tuple[str, ...] = ('dosexec',)
mimes_compressed: Tuple[str, ...] = ('zip', 'rar', 'x-rar', 'bzip2', 'lzip', 'lzma', 'lzop', mimes_compressed: Tuple[str, ...] = ('zip', 'rar', 'x-rar', 'bzip2', 'lzip', 'lzma', 'lzop',
'xz', 'compress', 'gzip', 'tar',) 'xz', 'compress', 'gzip', 'tar',)
@@ -59,6 +60,13 @@ class Config:
'audio/ogg': 'application/ogg' 'audio/ogg': 'application/ogg'
} }
# Mime Type / Extension fix. TODO: Doesn't quite work....????
mimetypes.add_type('text/plain','.csv',False)
mimetypes.add_type('text/csv','.csv',False)
mimetypes.add_type('application/vnd.apple.numbers', '.numbers', True)
mimetypes.add_type('application/vnd.apple.pages', '.pages', False)
mimetypes.add_type('application/vnd.apple.keynote', '.keynote', False)
# EXTS # EXTS
# Commonly used malicious extensions # Commonly used malicious extensions
# Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/ # Sources: http://www.howtogeek.com/137270/50-file-extensions-that-are-potentially-dangerous-on-windows/
@@ -118,7 +126,12 @@ class Config:
# In [12]: mimetypes.guess_type('toot.tar.gz', strict=False) # In [12]: mimetypes.guess_type('toot.tar.gz', strict=False)
# Out[12]: ('application/x-tar', 'gzip') # Out[12]: ('application/x-tar', 'gzip')
# It works as expected if you do mimetypes.guess_type('application/gzip', strict=False) # It works as expected if you do mimetypes.guess_type('application/gzip', strict=False)
override_ext: Dict[str, str] = {'.gz': 'application/gzip'} override_ext: Dict[str, str] = {'.gz': 'application/gzip'
, '.csv': 'text/csv' #,'text/plain' )
, '.numbers': 'application/vnd.apple.numbers' #,'application/zip')
, '.pages': 'application/vnd.apple.pages' #,'application/zip')
, '.keynote': 'application/vnd.apple.keynote' #,'application/zip')
}
SEVENZ_PATH = '/usr/bin/7z' SEVENZ_PATH = '/usr/bin/7z'
@@ -144,6 +157,7 @@ class File(FileBase):
(Config.mimes_libreoffice, self._libreoffice), (Config.mimes_libreoffice, self._libreoffice),
(Config.mimes_pdf, self._pdf), (Config.mimes_pdf, self._pdf),
(Config.mimes_xml, self.text), (Config.mimes_xml, self.text),
(Config.mimes_csv, self.text),
(Config.mimes_ms, self._executables), (Config.mimes_ms, self._executables),
(Config.mimes_compressed, self._archive), (Config.mimes_compressed, self._archive),
(Config.mimes_data, self._binary_app), (Config.mimes_data, self._binary_app),
@@ -188,6 +202,7 @@ class File(FileBase):
if self.extension in Config.override_ext: if self.extension in Config.override_ext:
expected_mimetypes = Config.override_ext[self.extension] expected_mimetypes = Config.override_ext[self.extension]
encoding = None encoding = None
self.mimetype = expected_mimetypes
else: else:
expected_mimetype, encoding = mimetypes.guess_type(str(self.src_path), expected_mimetype, encoding = mimetypes.guess_type(str(self.src_path),
strict=False) strict=False)
@@ -377,6 +392,10 @@ class File(FileBase):
if mt in self.subtype: if mt in self.subtype:
self._ooxml() self._ooxml()
return return
for mt in Config.mimes_csv:
if mt in self.subtype:
self.add_description('CSV file')
return
self.add_description('Plain text file') self.add_description('Plain text file')
self.force_ext('.txt') self.force_ext('.txt')
@@ -814,17 +833,23 @@ class KittenGroomerFileCheck(KittenGroomerBase):
Performs a depth-first traversal of the file tree. Performs a depth-first traversal of the file tree.
""" """
skipped_files = ( '.Trashes', '._.Trashes', '.DS_Store', '.fseventsd', '.Spotlight-V100','System Volume Information')
queue = [] queue = []
for path in sorted(os.listdir(root_dir_path), key=lambda x: str.lower(x)): for path in sorted(os.listdir(root_dir_path), key=lambda x: str.lower(x)):
full_path = root_dir_path / path full_path = root_dir_path / path
filename = full_path.name
if not filename in skipped_files and not filename.startswith('._'):
# check for symlinks first to prevent getting trapped in infinite symlink recursion # check for symlinks first to prevent getting trapped in infinite symlink recursion
if full_path.is_symlink(): if full_path.is_symlink():
queue.append(full_path) queue.append(full_path)
elif full_path.is_dir(): elif full_path.is_dir():
# Skip hidden and special directories.
queue.append(full_path) queue.append(full_path)
queue += self.list_files_dirs(full_path) queue += self.list_files_dirs(full_path)
elif full_path.is_file(): elif full_path.is_file():
queue.append(full_path) queue.append(full_path)
else:
print("SKIPPING: "+filename)
return queue return queue
def run(self): def run(self):