mirror of https://github.com/CIRCL/PyCIRCLean
Move to PIL for EXIF; add PNG metadata extractor; modularize metadata extraction
Switch back to exifread; PIL's EXIF support sucks.
pull/2/head^2
parent
ca90a08159
commit
53b61d487e
|
@ -12,8 +12,9 @@ import olefile
|
||||||
import officedissector
|
import officedissector
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
import exifread
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
from PIL.ExifTags import TAGS, GPSTAGS
|
||||||
|
from PIL import PngImagePlugin
|
||||||
|
|
||||||
from pdfid import PDFiD, cPDFiD
|
from pdfid import PDFiD, cPDFiD
|
||||||
|
|
||||||
|
@ -35,7 +36,11 @@ mimes_compressed = ['zip', 'rar', 'bzip2', 'lzip', 'lzma', 'lzop',
|
||||||
mimes_data = ['octet-stream']
|
mimes_data = ['octet-stream']
|
||||||
|
|
||||||
# Prepare image/<subtype>
|
# Prepare image/<subtype>
|
||||||
mimes_metadata = ['jpeg', 'tiff']
|
mimes_exif = ['image/jpeg', 'image/tiff']
|
||||||
|
mimes_png = ['image/png']
|
||||||
|
|
||||||
|
# Mime types we can pull metadata from
|
||||||
|
mimes_metadata = ['image/jpeg', 'image/tiff', 'image/png']
|
||||||
|
|
||||||
# Aliases
|
# Aliases
|
||||||
aliases = {
|
aliases = {
|
||||||
|
@ -127,8 +132,8 @@ class File(FileBase):
|
||||||
# there are no known extensions associated to this mimetype.
|
# there are no known extensions associated to this mimetype.
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def has_image_metadata(self):
|
def has_metadata(self):
|
||||||
if self.sub_type in mimes_metadata:
|
if self.mimetype in mimes_metadata:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -161,6 +166,12 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
]
|
]
|
||||||
self.subtypes_application = self._init_subtypes_application(subtypes_apps)
|
self.subtypes_application = self._init_subtypes_application(subtypes_apps)
|
||||||
|
|
||||||
|
types_metadata = [
|
||||||
|
(mimes_exif, self._metadata_exif),
|
||||||
|
(mimes_png, self._metadata_png),
|
||||||
|
]
|
||||||
|
self.metadata_processing_options = self._init_subtypes_application(types_metadata)
|
||||||
|
|
||||||
self.mime_processing_options = {
|
self.mime_processing_options = {
|
||||||
'text': self.text,
|
'text': self.text,
|
||||||
'audio': self.audio,
|
'audio': self.audio,
|
||||||
|
@ -413,27 +424,56 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
self._safe_copy()
|
self._safe_copy()
|
||||||
|
|
||||||
#######################
|
#######################
|
||||||
|
# Metadata extractors
|
||||||
|
def _metadata_exif(self, metadataFile):
    """Write the EXIF tags of the current file to ``metadataFile``.

    The image at ``self.cur_file.src_path`` is opened with PIL, numeric
    EXIF tag ids are translated to names through ``TAGS`` (entries of the
    GPS block through ``GPSTAGS``), and one "Key/Value" line per tag is
    written in sorted key order. Thumbnail and maker-note tags are left
    out of the dump. An image without EXIF data produces no lines.

    :param metadataFile: writable object; only ``write()`` is called on it.
    """
    # The first three names are the ones exifread reports; 'MakerNote' is
    # the name PIL's TAGS table uses for the same (large, opaque) field.
    skipped = ('JPEGThumbnail', 'TIFFThumbnail', 'EXIF MakerNote', 'MakerNote')

    img = Image.open(self.cur_file.src_path)
    try:
        # _getexif() is a private helper that not every PIL image class
        # provides, and it may return None when the file carries no EXIF.
        getexif = getattr(img, '_getexif', None)
        exif = (getexif() if getexif is not None else None) or {}

        md = {}
        for tag, value in exif.items():
            # Unknown ids fall back to the id itself instead of KeyError.
            decoded = TAGS.get(tag, tag)
            if decoded == "GPSInfo" and isinstance(value, dict):
                # Flatten the GPS sub-dictionary into the main mapping.
                for t in value:
                    md[str(GPSTAGS.get(t, t))] = value[t]
            else:
                md[str(decoded)] = value

        # Keys are all strings at this point, so sorting is well defined.
        for tag in sorted(md):
            if tag not in skipped:
                metadataFile.write("Key: {}\tValue: {}\n".format(tag, md[tag]))
        self.cur_file.add_log_details('metadata', 'exif')
    finally:
        # Release the image handle even when decoding or writing fails.
        img.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _metadata_png(self, metadataFile):
    """Write the PNG metadata of the current file to ``metadataFile``.

    The image at ``self.cur_file.src_path`` is opened with PIL and every
    entry of its ``info`` dictionary is written as one "Key/Value" line,
    in sorted key order.

    :param metadataFile: writable object; only ``write()`` is called on it.
    """
    img = Image.open(self.cur_file.src_path)
    try:
        info = img.info
        for tag in sorted(info):
            metadataFile.write("Key: {}\tValue: {}\n".format(tag, info[tag]))
        self.cur_file.add_log_details('metadata', 'png')
    finally:
        # Release the image handle even when sorting or writing fails.
        img.close()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_metadata(self):
    """Dump the current file's metadata into a ``.metadata.txt`` side file.

    The extractor is looked up in ``self.metadata_processing_options``
    using ``self.cur_file.mimetype``. When no extractor is registered for
    that mimetype the method returns without creating an output file.
    Otherwise the output object is obtained from
    ``self._safe_metadata_split(".metadata.txt")``, handed to the
    extractor, and closed afterwards, including when the extractor raises.
    """
    extractor = self.metadata_processing_options.get(self.cur_file.mimetype)
    if extractor is None:
        # Nothing registered for this mimetype: calling None would raise
        # TypeError, so skip before an output file is opened.
        return

    metadataFile = self._safe_metadata_split(".metadata.txt")
    try:
        extractor(metadataFile)
    finally:
        metadataFile.close()
|
||||||
|
|
||||||
|
#######################
|
||||||
# ##### Not converted, checking the mime type ######
|
# ##### Not converted, checking the mime type ######
|
||||||
def audio(self):
|
def audio(self):
|
||||||
'''Way to process an audio file'''
|
'''Way to process an audio file'''
|
||||||
self.cur_file.log_string += 'Audio file'
|
self.cur_file.log_string += 'Audio file'
|
||||||
self._media_processing()
|
self._media_processing()
|
||||||
|
|
||||||
|
|
||||||
def image(self):
|
def image(self):
|
||||||
'''Way to process an image'''
|
'''Way to process an image'''
|
||||||
# Extract the metadata
|
if self.cur_file.has_metadata():
|
||||||
if self.cur_file.has_image_metadata():
|
self.extract_metadata()
|
||||||
metadataFile = self._safe_metadata_split(".exif")
|
|
||||||
f = open(self.cur_file.src_path, 'rb')
|
|
||||||
tags = exifread.process_file(f) # TODO: Switch to PyExifTool for raw, etc. support?
|
|
||||||
for tag in sorted(tags.keys()):
|
|
||||||
if tag not in ('JPEGThumbnail', 'TIFFThumbnail', 'EXIF MakerNote'):
|
|
||||||
metadataFile.write("Key: {}\tValue: {}\n".format(tag, tags[tag]))
|
|
||||||
metadataFile.close()
|
|
||||||
f.close()
|
|
||||||
self.cur_file.add_log_details('metadata', 'exif')
|
|
||||||
|
|
||||||
|
## FIXME make sure this works for png, gif, tiff
|
||||||
# Create a temp directory
|
# Create a temp directory
|
||||||
dst_dir, filename = os.path.split(self.cur_file.dst_path)
|
dst_dir, filename = os.path.split(self.cur_file.dst_path)
|
||||||
tmpdir = os.path.join(dst_dir, 'temp')
|
tmpdir = os.path.join(dst_dir, 'temp')
|
||||||
|
@ -442,6 +482,7 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
|
|
||||||
# Do our image conversions
|
# Do our image conversions
|
||||||
warnings.simplefilter('error', Image.DecompressionBombWarning)
|
warnings.simplefilter('error', Image.DecompressionBombWarning)
|
||||||
|
try:
|
||||||
imIn = Image.open(self.cur_file.src_path)
|
imIn = Image.open(self.cur_file.src_path)
|
||||||
imOut = Image.frombytes(imIn.mode, imIn.size, imIn.tobytes())
|
imOut = Image.frombytes(imIn.mode, imIn.size, imIn.tobytes())
|
||||||
imOut.save(tmppath)
|
imOut.save(tmppath)
|
||||||
|
@ -450,6 +491,12 @@ class KittenGroomerFileCheck(KittenGroomerBase):
|
||||||
self._safe_copy(tmppath)
|
self._safe_copy(tmppath)
|
||||||
self._safe_rmtree(tmpdir)
|
self._safe_rmtree(tmpdir)
|
||||||
|
|
||||||
|
# Catch decompression bombs
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
self.cur_file.make_dangerous()
|
||||||
|
self._safe_copy()
|
||||||
|
|
||||||
self.cur_file.log_string += 'Image file'
|
self.cur_file.log_string += 'Image file'
|
||||||
self.cur_file.add_log_details('processing_type', 'image')
|
self.cur_file.add_log_details('processing_type', 'image')
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue