diff --git a/poetry.lock b/poetry.lock index 2bc3bb4..1b8812b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -284,6 +284,14 @@ zipp = ">=0.5" docs = ["sphinx", "rst.linker"] testing = ["packaging", "pep517", "importlib-resources (>=1.3)"] +[[package]] +name = "ipaddress" +version = "1.0.23" +description = "IPv4/IPv6 manipulation library" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "ipykernel" version = "5.3.4" @@ -484,6 +492,19 @@ category = "main" optional = true python-versions = ">=2.7" +[[package]] +name = "mail-parser" +version = "3.12.0" +description = "Wrapper for email standard library" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +ipaddress = "1.0.23" +simplejson = "3.17.0" +six = "1.14.0" + [[package]] name = "markupsafe" version = "1.1.1" @@ -913,9 +934,17 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "simplejson" +version = "3.17.0" +description = "Simple, fast, extensible JSON encoder/decoder for Python" +category = "main" +optional = false +python-versions = ">=2.5, !=3.0.*, !=3.1.*, !=3.2.*" + [[package]] name = "six" -version = "1.15.0" +version = "1.14.0" description = "Python 2 and 3 compatibility utilities" category = "main" optional = false @@ -1193,7 +1222,7 @@ virustotal = ["validators"] [metadata] lock-version = "1.1" python-versions = "^3.6" -content-hash = "a2bf3a2d2162cc76563904258ac8b667801f14c3f3ff9df310b4d5c23d4e13d9" +content-hash = "23aa8f0499f0012761ac2f91c02c2ad02a3a4fb53dd57bdcdca5db2b32b54634" [metadata.files] alabaster = [ @@ -1384,6 +1413,10 @@ importlib-metadata = [ {file = "importlib_metadata-2.0.0-py2.py3-none-any.whl", hash = "sha256:cefa1a2f919b866c5beb7c9f7b0ebb4061f30a8a9bf16d609b000e2dfaceb9c3"}, {file = "importlib_metadata-2.0.0.tar.gz", hash = "sha256:77a540690e24b0305878c37ffd421785a6f7e53c8b5720d211b211de8d0e95da"}, ] +ipaddress = [ + {file = "ipaddress-1.0.23-py2.py3-none-any.whl", hash = 
"sha256:6e0f4a39e66cb5bb9a137b00276a2eff74f93b71dcbdad6f10ff7df9d3557fcc"}, + {file = "ipaddress-1.0.23.tar.gz", hash = "sha256:b7f8e0369580bb4a24d5ba1d7cc29660a4a6987763faf1d8a8046830e020e7e2"}, +] ipykernel = [ {file = "ipykernel-5.3.4-py3-none-any.whl", hash = "sha256:d6fbba26dba3cebd411382bc484f7bc2caa98427ae0ddb4ab37fe8bfeb5c7dd3"}, {file = "ipykernel-5.3.4.tar.gz", hash = "sha256:9b2652af1607986a1b231c62302d070bc0534f564c393a5d9d130db9abbbe89d"}, @@ -1448,6 +1481,10 @@ lief = [ {file = "lief-0.10.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:276cc63ec12a21bdf01b8d30962692c17499788234f0765247ca7a35872097ec"}, {file = "lief-0.10.1.tar.gz", hash = "sha256:a487fe7234c04bccd58223dbb79214421176e2629814c7a4a887764cceb5be7c"}, ] +mail-parser = [ + {file = "mail-parser-3.12.0.tar.gz", hash = "sha256:e8ff4ac4b27d4a0a87fe69cdaca9a9123f9662b28991b3b838e449a779345214"}, + {file = "mail_parser-3.12.0-py3-none-any.whl", hash = "sha256:b948e2905ae1f8823b2b2b3acaca8595d959cf73ca89e2bc86220b895f7af4d2"}, +] markupsafe = [ {file = "MarkupSafe-1.1.1-cp27-cp27m-macosx_10_6_intel.whl", hash = "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161"}, {file = "MarkupSafe-1.1.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7"}, @@ -1749,9 +1786,39 @@ send2trash = [ {file = "Send2Trash-1.5.0-py3-none-any.whl", hash = "sha256:f1691922577b6fa12821234aeb57599d887c4900b9ca537948d2dac34aea888b"}, {file = "Send2Trash-1.5.0.tar.gz", hash = "sha256:60001cc07d707fe247c94f74ca6ac0d3255aabcb930529690897ca2a39db28b2"}, ] +simplejson = [ + {file = "simplejson-3.17.0-cp27-cp27m-macosx_10_13_x86_64.whl", hash = "sha256:87d349517b572964350cc1adc5a31b493bbcee284505e81637d0174b2758ba17"}, + {file = "simplejson-3.17.0-cp27-cp27m-win32.whl", hash = "sha256:1d1e929cdd15151f3c0b2efe953b3281b2fd5ad5f234f77aca725f28486466f6"}, + {file = "simplejson-3.17.0-cp27-cp27m-win_amd64.whl", hash = 
"sha256:1ea59f570b9d4916ae5540a9181f9c978e16863383738b69a70363bc5e63c4cb"}, + {file = "simplejson-3.17.0-cp33-cp33m-win32.whl", hash = "sha256:8027bd5f1e633eb61b8239994e6fc3aba0346e76294beac22a892eb8faa92ba1"}, + {file = "simplejson-3.17.0-cp33-cp33m-win_amd64.whl", hash = "sha256:22a7acb81968a7c64eba7526af2cf566e7e2ded1cb5c83f0906b17ff1540f866"}, + {file = "simplejson-3.17.0-cp34-cp34m-win32.whl", hash = "sha256:17163e643dbf125bb552de17c826b0161c68c970335d270e174363d19e7ea882"}, + {file = "simplejson-3.17.0-cp34-cp34m-win_amd64.whl", hash = "sha256:0fe3994207485efb63d8f10a833ff31236ed27e3b23dadd0bf51c9900313f8f2"}, + {file = "simplejson-3.17.0-cp35-cp35m-win32.whl", hash = "sha256:4cf91aab51b02b3327c9d51897960c554f00891f9b31abd8a2f50fd4a0071ce8"}, + {file = "simplejson-3.17.0-cp35-cp35m-win_amd64.whl", hash = "sha256:fc9051d249dd5512e541f20330a74592f7a65b2d62e18122ca89bf71f94db748"}, + {file = "simplejson-3.17.0-cp36-cp36m-macosx_10_13_x86_64.whl", hash = "sha256:86afc5b5cbd42d706efd33f280fec7bd7e2772ef54e3f34cf6b30777cd19a614"}, + {file = "simplejson-3.17.0-cp36-cp36m-win32.whl", hash = "sha256:926bcbef9eb60e798eabda9cd0bbcb0fca70d2779aa0aa56845749d973eb7ad5"}, + {file = "simplejson-3.17.0-cp36-cp36m-win_amd64.whl", hash = "sha256:daaf4d11db982791be74b23ff4729af2c7da79316de0bebf880fa2d60bcc8c5a"}, + {file = "simplejson-3.17.0-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:9a126c3a91df5b1403e965ba63b304a50b53d8efc908a8c71545ed72535374a3"}, + {file = "simplejson-3.17.0-cp37-cp37m-win32.whl", hash = "sha256:fc046afda0ed8f5295212068266c92991ab1f4a50c6a7144b69364bdee4a0159"}, + {file = "simplejson-3.17.0-cp37-cp37m-win_amd64.whl", hash = "sha256:7cce4bac7e0d66f3a080b80212c2238e063211fe327f98d764c6acbc214497fc"}, + {file = "simplejson-3.17.0.tar.gz", hash = "sha256:2b4b2b738b3b99819a17feaf118265d0753d5536049ea570b3c43b51c4701e81"}, + {file = "simplejson-3.17.0.win-amd64-py2.7.exe", hash = "sha256:1d346c2c1d7dd79c118f0cc7ec5a1c4127e0c8ffc83e7b13fc5709ff78c9bb84"}, 
+ {file = "simplejson-3.17.0.win-amd64-py3.3.exe", hash = "sha256:5cfd495527f8b85ce21db806567de52d98f5078a8e9427b18e251c68bd573a26"}, + {file = "simplejson-3.17.0.win-amd64-py3.4.exe", hash = "sha256:8de378d589eccbc75941e480b4d5b4db66f22e4232f87543b136b1f093fff342"}, + {file = "simplejson-3.17.0.win-amd64-py3.5.exe", hash = "sha256:f4b64a1031acf33e281fd9052336d6dad4d35eee3404c95431c8c6bc7a9c0588"}, + {file = "simplejson-3.17.0.win-amd64-py3.6.exe", hash = "sha256:ad8dd3454d0c65c0f92945ac86f7b9efb67fa2040ba1b0189540e984df904378"}, + {file = "simplejson-3.17.0.win-amd64-py3.7.exe", hash = "sha256:229edb079d5dd81bf12da952d4d825bd68d1241381b37d3acf961b384c9934de"}, + {file = "simplejson-3.17.0.win32-py2.7.exe", hash = "sha256:4fd5f79590694ebff8dc980708e1c182d41ce1fda599a12189f0ca96bf41ad70"}, + {file = "simplejson-3.17.0.win32-py3.3.exe", hash = "sha256:d140e9376e7f73c1f9e0a8e3836caf5eec57bbafd99259d56979da05a6356388"}, + {file = "simplejson-3.17.0.win32-py3.4.exe", hash = "sha256:da00675e5e483ead345429d4f1374ab8b949fba4429d60e71ee9d030ced64037"}, + {file = "simplejson-3.17.0.win32-py3.5.exe", hash = "sha256:7739940d68b200877a15a5ff5149e1599737d6dd55e302625650629350466418"}, + {file = "simplejson-3.17.0.win32-py3.6.exe", hash = "sha256:60aad424e47c5803276e332b2a861ed7a0d46560e8af53790c4c4fb3420c26c2"}, + {file = "simplejson-3.17.0.win32-py3.7.exe", hash = "sha256:1fbba86098bbfc1f85c5b69dc9a6d009055104354e0d9880bb00b692e30e0078"}, +] six = [ - {file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"}, - {file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"}, + {file = "six-1.14.0-py2.py3-none-any.whl", hash = "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c"}, + {file = "six-1.14.0.tar.gz", hash = "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a"}, ] snowballstemmer = [ {file = 
"snowballstemmer-2.0.0-py2.py3-none-any.whl", hash = "sha256:209f257d7533fdb3cb73bdbd24f436239ca3b2fa67d56f6ff88e86be08cc5ef0"}, diff --git a/pymisp/tools/emailobject.py b/pymisp/tools/emailobject.py index ffc8b76..303c26e 100644 --- a/pymisp/tools/emailobject.py +++ b/pymisp/tools/emailobject.py @@ -1,121 +1,211 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - +import os +from email import policy +from email.message import EmailMessage +from io import BytesIO +from pathlib import Path +from typing import Union, List, Tuple +import email.utils +import ipaddress +import logging +import mailparser # type: ignore +from mailparser.utils import msgconvert # type: ignore from ..exceptions import InvalidMISPObject from .abstractgenerator import AbstractMISPObjectGenerator -from io import BytesIO -import logging -from email import message_from_bytes, policy -import email.utils -from pathlib import Path -from typing import Union + +try: + import magic # type: ignore + import tempfile +except ImportError: + magic = None logger = logging.getLogger('pymisp') class EMailObject(AbstractMISPObjectGenerator): + def __init__(self, filepath: Union[Path, str] = None, pseudofile: BytesIO = None, + attach_original_email: bool = True, **kwargs): + super().__init__("email", **kwargs) - def __init__(self, filepath: Union[Path, str] = None, pseudofile: BytesIO = None, attach_original_email: bool = True, **kwargs): - # PY3 way: - # super().__init__('file') - super(EMailObject, self).__init__('email', **kwargs) + converted = False if filepath: - with open(filepath, 'rb') as f: - self.__pseudofile = BytesIO(f.read()) + if str(filepath).endswith(".msg"): + pseudofile = self.__convert_outlook_msg_format(str(filepath)) + converted = True + else: + with open(filepath, "rb") as f: + pseudofile = BytesIO(f.read()) + elif pseudofile and isinstance(pseudofile, BytesIO): - self.__pseudofile = pseudofile + if magic: + # if python-magic is installed, we can autodetect MS Outlook format + mime = 
magic.from_buffer(pseudofile.read(2048), mime=True) + pseudofile.seek(0) + if mime == "application/CDFV2": + # save outlook msg file to temporary file + temph, temp = tempfile.mkstemp(prefix="outlook_") + with os.fdopen(temph, "wb") as fdfile: + fdfile.write(pseudofile.getvalue()) + fdfile.close() + pseudofile = self.__convert_outlook_msg_format(temp) + os.unlink(temp) # remove temporary file necessary to convert formats + converted = True + else: - raise InvalidMISPObject('File buffer (BytesIO) or a path is required.') - self.__email = message_from_bytes(self.__pseudofile.getvalue(), policy=policy.default) - # Improperly encoded emails (utf-8-sig) fail silently. An empty email indicates this might be the case. - if len(self.__email) == 0: - self.attempt_decoding() + raise InvalidMISPObject("File buffer (BytesIO) or a path is required.") + if attach_original_email: - self.add_attribute('eml', value='Full email.eml', data=self.__pseudofile) - self.generate_attributes() + self.add_attribute("eml", value="Full email.eml", data=pseudofile, + comment="Converted from MSG format" if converted else None) + + message = self.attempt_decoding(pseudofile) + self.__parser = mailparser.MailParser(message) + self.__generate_attributes() + + @staticmethod + def __convert_outlook_msg_format(filepath: str) -> BytesIO: + converted_file, _ = msgconvert(filepath) + with open(converted_file, "rb") as f: + pseudofile = BytesIO(f.read()) + os.remove(converted_file) # delete temporary file + return pseudofile + + @staticmethod + def attempt_decoding(bytes_io: BytesIO) -> EmailMessage: + """Attempt to decode different kinds of emails, for example non-ascii encoded emails.""" + bytes = bytes_io.getvalue() + + message: EmailMessage = email.message_from_bytes(bytes, policy=policy.default) # type: ignore + + if len(message) != 0: + return message + + # Improperly encoded emails (utf-8-sig) fail silently. An empty email indicates this might be the case. 
+ try: + bytes.decode("ASCII") + raise Exception("EmailObject failed to decode ASCII encoded email.") + except UnicodeDecodeError: + logger.debug("EmailObject was passed a non-ASCII encoded binary blob.") + try: + if bytes[:3] == b'\xef\xbb\xbf': # utf-8-sig byte-order mark (BOM) + # Set Pseudofile to correctly encoded email in case it is used at some later point. + bytes = bytes.decode("utf_8_sig").encode("ASCII") + message = email.message_from_bytes(bytes, policy=policy.default) # type: ignore + return message + except UnicodeDecodeError: + pass + + raise Exception( + "EmailObject does not know how to decode binary blob passed to it. Object may not be an email. If this is an email please submit it as an issue to PyMISP so we can add support.") @property - def email(self): - return self.__email + def email(self) -> EmailMessage: + return self.__parser.message @property - def attachments(self): + def attachments(self) -> List[Tuple[str, BytesIO]]: to_return = [] - for attachment in self.__email.iter_attachments(): - content = attachment.get_content() + for attachment in self.email.iter_attachments(): + content = attachment.get_content() # type: ignore if isinstance(content, str): content = content.encode() to_return.append((attachment.get_filename(), BytesIO(content))) return to_return - def attempt_decoding(self): - """Attempt to decode non-ascii encoded emails. - """ - _msg_bytes = self.__pseudofile.getvalue() - try: - _msg_bytes.decode("ASCII") - logger.info("EmailObject failed to decode ASCII encoded email.") - return - except UnicodeDecodeError: - logger.debug("EmailObject was passed a non-ASCII encoded binary blob.") - try: - if _msg_bytes[:3] == b'\xef\xbb\xbf': # utf-8-sig byte-order mark (BOM) - # Set Pseudofile to correctly encoded email in case it is used at some later point. 
- self.__pseudofile = BytesIO(_msg_bytes.decode('utf_8_sig').encode("ASCII")) - self.__email = message_from_bytes(self.__pseudofile.getvalue(), policy=policy.default) - except UnicodeDecodeError: - logger.debug("EmailObject does not know how to decode binary blob passed to it. Object may not be an email. If this is an email please submit it as an issue to PyMISP so we can add support.") + def __generate_attributes(self): + message = self.email - def generate_attributes(self): - if self.__email.get_body(preferencelist=('html', 'plain')): - self.add_attribute('email-body', value=self.__email.get_body(preferencelist=('html', 'plain')).get_payload(decode=True).decode('utf8', 'surrogateescape')) - if 'Reply-To' in self.__email: - self.add_attribute('reply-to', value=self.__email['Reply-To']) - if 'Message-ID' in self.__email: - self.add_attribute('message-id', value=self.__email['Message-ID']) - if 'To' in self.__email: - self._add_emails('to', self.__email['To']) - if 'Cc' in self.__email: - # TODO: split name and email address - to_add = [to.strip() for to in self.__email['Cc'].split(',')] - self.add_attributes('cc', *to_add) - if 'Subject' in self.__email: - self.add_attribute('subject', value=self.__email['Subject']) - if 'From' in self.__email: - self._add_emails('from', self.__email['From']) - if 'Return-Path' in self.__email: - # TODO: split name and email address - self.add_attribute('return-path', value=self.__email['Return-Path']) - if 'User-Agent' in self.__email: - self.add_attribute('user-agent', value=self.__email['User-Agent']) - if self.__email.get_boundary(): - self.add_attribute('mime-boundary', value=self.__email.get_boundary()) - if 'X-Mailer' in self.__email: - self.add_attribute('x-mailer', value=self.__email['X-Mailer']) - if 'Thread-Index' in self.__email: - self.add_attribute('thread-index', value=self.__email['Thread-Index']) - if 'Date' in self.__email: - self.add_attribute('send-date', value=self._sanitize_timestamp(self.__email['Date'])) - # 
TODO: email-header: all headers in one bloc - # TODO: BCC? - # TODO: received headers sometimes have TO email addresses + body = message.get_body(preferencelist=("html", "plain")) + if body: + self.add_attribute("email-body", body.get_payload(decode=True).decode('utf8', 'surrogateescape')) - def _add_emails(self, type: str, data: str): - parts = [part.strip() for part in data.split(',')] + headers = ["{}: {}".format(k, v) for k, v in message.items()] + if headers: + self.add_attribute("header", "\n".join(headers)) + + message_date = self.__parser.date + if message_date: + self.add_attribute("send-date", message_date) + + if "To" in message: + self.__add_emails("to", message["To"]) + + if "From" in message: + self.__add_emails("from", message["From"]) + + if "Return-Path" in message: + realname, address = email.utils.parseaddr(message["Return-Path"]) + self.add_attribute("return-path", address) + + if "Reply-To" in message: + realname, address = self.__parser.reply_to[0] + if address and realname: + self.add_attribute("reply-to", value=address, comment=message["Reply-To"]) + elif address: + self.add_attribute("reply-to", address) + else: # invalid format, insert original value + self.add_attribute("reply-to", message["Reply-To"]) + + if "Cc" in message: + self.__add_emails("cc", message["Cc"], insert_display_names=False) + + if "Subject" in message: + self.add_attribute("subject", message["Subject"]) + + if "Message-ID" in message: + self.add_attribute("message-id", message["Message-ID"]) + + if "User-Agent" in message: + self.add_attribute("user-agent", message["User-Agent"]) + + boundary = message.get_boundary() + if boundary: + self.add_attribute("mime-boundary", boundary) + + if "X-Mailer" in message: + self.add_attribute("x-mailer", message["X-Mailer"]) + + if "Thread-Index" in message: + self.add_attribute("thread-index", message["Thread-Index"]) + + self.__generate_received() + + def __add_emails(self, typ: str, data: str, insert_display_names: bool = True): 
+ parts = [part.strip() for part in data.split(",")] addresses = [] display_names = [] + for part in parts: realname, address = email.utils.parseaddr(part) - if address: # parsing failed, insert original value - addresses.push({"value": part}) - else: - addresses.push({"value": address, "comment": part}) + if address and realname: + addresses.append({"value": address, "comment": part}) + elif address: + addresses.append({"value": address}) + else: # parsing failed, insert original value + addresses.append({"value": part}) if realname: - display_names.push({"value": realname, "comment": part}) + display_names.append({"value": realname, "comment": part}) if addresses: - self.add_attributes(type, *addresses) - if display_names: - self.add_attributes("{}-display-name".format(type), *display_names) + self.add_attributes(typ, *addresses) + if insert_display_names and display_names: + self.add_attributes("{}-display-name".format(typ), *display_names) + def __generate_received(self): + """ + Extract IP addresses from received headers that are not private. + """ + for received in self.__parser.received: + tokens = received["from"].split(" ") + ip = None + for token in tokens: + try: + ip = ipaddress.ip_address(token) + break + except ValueError: + pass # token is not IP address + + if not ip or ip.is_private: + continue # skip header if IP not found or is private + + self.add_attribute("received-header-ip", value=str(ip), comment=received["from"]) diff --git a/pyproject.toml b/pyproject.toml index d8b4890..96d4085 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ requests = "^2.22.0" python-dateutil = "^2.8.1" jsonschema = "^3.2.0" deprecated = "^1.2.7" +mail-parser = {version = "3.12.0"} python-magic = {version = "^0.4.15", optional = true} pydeep = {version = "^0.4", optional = true} lief = {version = "^0.10.1", optional = true}