mirror of https://github.com/CIRCL/PyCIRCLean
Proper handling of OOXML docs
parent
aaad11b5c1
commit
e8de330d34
|
@ -171,7 +171,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
(mimes_png, self._metadata_png),
|
(mimes_png, self._metadata_png),
|
||||||
]
|
]
|
||||||
self.metadata_processing_options = self._init_subtypes_application(types_metadata)
|
self.metadata_processing_options = self._init_subtypes_application(types_metadata)
|
||||||
|
|
||||||
self.mime_processing_options = {
|
self.mime_processing_options = {
|
||||||
'text': self.text,
|
'text': self.text,
|
||||||
'audio': self.audio,
|
'audio': self.audio,
|
||||||
|
@ -274,17 +274,21 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
|
|
||||||
# ##### Converted ######
|
# ##### Converted ######
|
||||||
def text(self):
|
def text(self):
|
||||||
''' LibreOffice should be able to open all the files '''
|
|
||||||
for r in mimes_rtf:
|
for r in mimes_rtf:
|
||||||
if r in self.cur_file.sub_type:
|
if r in self.cur_file.sub_type:
|
||||||
self.cur_file.log_string += 'Rich Text file'
|
self.cur_file.log_string += 'Rich Text file'
|
||||||
# TODO: need a way to convert it to plain text
|
# TODO: need a way to convert it to plain text
|
||||||
self.cur_file.force_ext('.txt')
|
self.cur_file.force_ext('.txt')
|
||||||
self._safe_copy()
|
self._safe_copy()
|
||||||
else:
|
return
|
||||||
self.cur_file.log_string += 'Text file'
|
for o in mimes_ooxml:
|
||||||
self.cur_file.force_ext('.txt')
|
if o in self.cur_file.sub_type:
|
||||||
self._safe_copy()
|
self.cur_file.log_string += 'OOXML File'
|
||||||
|
self._ooxml()
|
||||||
|
return
|
||||||
|
self.cur_file.log_string += 'Text file'
|
||||||
|
self.cur_file.force_ext('.txt')
|
||||||
|
self._safe_copy()
|
||||||
|
|
||||||
def application(self):
|
def application(self):
|
||||||
''' Everything can be there, using the subtype to decide '''
|
''' Everything can be there, using the subtype to decide '''
|
||||||
|
@ -428,7 +432,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
def _metadata_exif(self, metadataFile):
|
def _metadata_exif(self, metadataFile):
|
||||||
img = open(self.cur_file.src_path, 'rb')
|
img = open(self.cur_file.src_path, 'rb')
|
||||||
tags = None
|
tags = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tags = exifread.process_file(img, debug=True)
|
tags = exifread.process_file(img, debug=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -442,7 +446,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
print(e)
|
print(e)
|
||||||
img.close()
|
img.close()
|
||||||
return False
|
return False
|
||||||
|
|
||||||
for tag in sorted(tags.keys()):
|
for tag in sorted(tags.keys()):
|
||||||
# These are long and obnoxious/binary
|
# These are long and obnoxious/binary
|
||||||
if tag not in ('JPEGThumbnail', 'TIFFThumbnail'):
|
if tag not in ('JPEGThumbnail', 'TIFFThumbnail'):
|
||||||
|
@ -493,7 +497,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
self.cur_file.log_string += 'Audio file'
|
self.cur_file.log_string += 'Audio file'
|
||||||
self._media_processing()
|
self._media_processing()
|
||||||
|
|
||||||
|
|
||||||
def image(self):
|
def image(self):
|
||||||
'''Way to process an image'''
|
'''Way to process an image'''
|
||||||
if self.cur_file.has_metadata():
|
if self.cur_file.has_metadata():
|
||||||
|
@ -516,7 +520,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
#Copy the file back out and cleanup
|
#Copy the file back out and cleanup
|
||||||
self._safe_copy(tmppath)
|
self._safe_copy(tmppath)
|
||||||
self._safe_rmtree(tmpdir)
|
self._safe_rmtree(tmpdir)
|
||||||
|
|
||||||
# Catch decompression bombs
|
# Catch decompression bombs
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Caught exception (possible decompression bomb?) while translating file {}.".format(self.cur_file.src_path))
|
print("Caught exception (possible decompression bomb?) while translating file {}.".format(self.cur_file.src_path))
|
||||||
|
|
Loading…
Reference in New Issue