From 07137209e215cf32379643559fef84088c23b0ed Mon Sep 17 00:00:00 2001 From: seamus tuohy Date: Wed, 9 Sep 2020 07:45:07 -0400 Subject: [PATCH] Attempt to decode utf-8-sig encoded emails. eml files downloaded from Windows Online security on some Windows 11 systems are automatically encoded in UTF-8 with a byte order mark (BOM) at the front of the file. This will cause the email parser to fail. This is a somewhat isolated problem. It will only affect a small subset of Windows users who download and re-upload eml files. But this small subset of users is the target user-base for the MISP email module: low-expertise users who wish to quickly share high-value indicators on an ad-hoc basis. While this fix could be tacked onto the MISP email module instead of here, I believe that this fix is more appropriate in the PyMISP object code. As the "email" object parser, this object should be built to parse all manner of emails that it may encounter. This includes common malformations such as this one, and even horrors such as the .msg format. This commit adds a generically named "attempt_decoding" function which can be expanded to address all manner of sins that are encountered in the future. --- pymisp/tools/emailobject.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pymisp/tools/emailobject.py b/pymisp/tools/emailobject.py index 74135e7..77f02b1 100644 --- a/pymisp/tools/emailobject.py +++ b/pymisp/tools/emailobject.py @@ -26,6 +26,9 @@ class EMailObject(AbstractMISPObjectGenerator): else: raise InvalidMISPObject('File buffer (BytesIO) or a path is required.') self.__email = message_from_bytes(self.__pseudofile.getvalue(), policy=policy.default) + # Improperly encoded emails (utf-8-sig) fail silently. An empty email indicates this might be the case. 
+ if len(self.__email) == 0: + self.attempt_decoding() if attach_original_email: self.add_attribute('eml', value='Full email.eml', data=self.__pseudofile) self.generate_attributes() @@ -44,6 +47,24 @@ class EMailObject(AbstractMISPObjectGenerator): to_return.append((attachment.get_filename(), BytesIO(content))) return to_return + def attempt_decoding(self): + """Attempt to decode non-ascii encoded emails. + """ + _msg_bytes = self.__pseudofile.getvalue() + try: + _msg_bytes.decode("ASCII") + logger.info("EmailObject failed to decode ASCII encoded email.") + return + except UnicodeDecodeError: + logger.debug("EmailObject was passed a non-ASCII encoded binary blob.") + try: + if _msg_bytes[:3] == b'\xef\xbb\xbf': # utf-8-sig byte-order mark (BOM) + # Set Pseudofile to correctly encoded email in case it is used at some later point. + self.__pseudofile = BytesIO(_msg_bytes.decode('utf_8_sig').encode("ASCII")) + self.__email = message_from_bytes(self.__pseudofile.getvalue(), policy=policy.default) + except UnicodeDecodeError: + logger.debug("EmailObject does not know how to decode binary blob passed to it. Object may not be an email. If this is an email please submit it as an issue to PyMISP so we can add support.") + def generate_attributes(self): if self.__email.get_body(preferencelist=('html', 'plain')): self.add_attribute('email-body', value=self.__email.get_body(preferencelist=('html', 'plain')).get_payload(decode=True).decode('utf8', 'surrogateescape'))