Merge pull request #624 from seamustuohy/fix-badly-encoded-emails

Attempt to decode utf-8-sig encoded emails.
pull/629/head
Raphaël Vinot 2020-09-09 15:02:18 +02:00 committed by GitHub
commit cab202e1da
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 21 additions and 0 deletions

View File

@ -26,6 +26,9 @@ class EMailObject(AbstractMISPObjectGenerator):
else:
raise InvalidMISPObject('File buffer (BytesIO) or a path is required.')
self.__email = message_from_bytes(self.__pseudofile.getvalue(), policy=policy.default)
# Improperly encoded emails (utf-8-sig) fail silently. An empty email indicates this might be the case.
if len(self.__email) == 0:
self.attempt_decoding()
if attach_original_email:
self.add_attribute('eml', value='Full email.eml', data=self.__pseudofile)
self.generate_attributes()
@ -44,6 +47,24 @@ class EMailObject(AbstractMISPObjectGenerator):
to_return.append((attachment.get_filename(), BytesIO(content)))
return to_return
def attempt_decoding(self):
    """Attempt to re-parse an email whose raw bytes are not plain ASCII.

    Intended as a fallback for when the initial parse produced an empty
    message. Only UTF-8 input carrying a byte-order mark (``utf-8-sig``)
    is handled: the BOM is stripped, the stored buffer is replaced with
    the BOM-free bytes and the message is parsed again.

    Nothing is changed when the buffer is pure ASCII (the empty parse has
    some other cause), when it has no BOM, or when it is not valid UTF-8.
    """
    _msg_bytes = self.__pseudofile.getvalue()
    try:
        _msg_bytes.decode("ASCII")
    except UnicodeDecodeError:
        logger.debug("EmailObject was passed a non-ASCII encoded binary blob.")
    else:
        # Pure ASCII input: encoding is not the reason the parse came back empty.
        logger.info("EmailObject failed to decode ASCII encoded email.")
        return

    if not _msg_bytes.startswith(b'\xef\xbb\xbf'):  # utf-8-sig byte-order mark (BOM)
        return

    try:
        _decoded = _msg_bytes.decode('utf_8_sig')
    except UnicodeDecodeError:
        logger.debug("EmailObject does not know how to decode binary blob passed to it. Object may not be an email. If this is an email please submit it as an issue to PyMISP so we can add support.")
        return

    # Re-encode as UTF-8 rather than ASCII: the goal is only to drop the BOM.
    # Encoding to ASCII raised an uncaught UnicodeEncodeError for any message
    # that actually contained non-ASCII characters.
    # Set Pseudofile to correctly encoded email in case it is used at some later point.
    self.__pseudofile = BytesIO(_decoded.encode('utf-8'))
    self.__email = message_from_bytes(self.__pseudofile.getvalue(), policy=policy.default)
def generate_attributes(self):
if self.__email.get_body(preferencelist=('html', 'plain')):
self.add_attribute('email-body', value=self.__email.get_body(preferencelist=('html', 'plain')).get_payload(decode=True).decode('utf8', 'surrogateescape'))